From b41be28272c180c8812ff3355aef5c995950c82d Mon Sep 17 00:00:00 2001
From: Dennis Bader <dennis.bader@gmx.ch>
Date: Fri, 20 Sep 2024 18:29:44 +0200
Subject: [PATCH] Feat/metrics quantiles (#2530)

---
 CHANGELOG.md                                  |   6 +-
 darts/ad/anomaly_model/forecasting_am.py      |   5 +
 darts/metrics/__init__.py                     |  12 +-
 darts/metrics/metrics.py                      | 818 ++++++++++++++----
 darts/models/forecasting/forecasting_model.py |  93 +-
 darts/models/forecasting/regression_model.py  |  16 +-
 darts/tests/metrics/test_metrics.py           | 600 ++++++++++++-
 .../models/forecasting/test_backtesting.py    | 174 ++++
 .../models/forecasting/test_residuals.py      | 265 ++++++
 darts/tests/utils/test_utils.py               |  46 +-
 darts/timeseries.py                           |  16 +-
 darts/utils/likelihood_models.py              |  15 +-
 darts/utils/utils.py                          |  62 +-
 13 files changed, 1929 insertions(+), 199 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8fa74e021d..66bf84b146 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,10 +11,14 @@ but cannot always guarantee backwards compatibility. Changes that may **break co
 
 **Improved**
 
-- Added `IQRDetector`, that allows to detect anomalies using the interquartile range algorithm. [#2441] by [Igor Urbanik](https://github.com/u8-igor).
+- Added `IQRDetector`, that allows to detect anomalies using the interquartile range algorithm. [#2441](https://github.com/unit8co/darts/issues/2441) by [Igor Urbanik](https://github.com/u8-igor).
 - Added hyperparameters controlling the hidden layer sizes for the feature encoders in `TiDEModel`. [#2408](https://github.com/unit8co/darts/issues/2408) by [eschibli](https://github.com/eschibli).
 - Added hyperparameter `activation` to `BlockRNNModel` to specify the activation function in case of a multi-layer output network. [#2408](https://github.com/unit8co/darts/issues/2408) by [eschibli](https://github.com/eschibli).
 - Added support for broadcasting to TimeSeries on component and sample level. [#2476](https://https://github.com/unit8co/darts/pull/2476) by [Joel L.](https://github.com/Joelius300).
+- Added support for computing metrics, backtest, and residuals on one or multiple quantiles `q`, either from probabilistic predictions or predicted quantiles. [#2530](https://github.com/unit8co/darts/issues/2530) by [Dennis Bader](https://github.com/dennisbader).
+- Added quantile interval metrics: `miw` (Mean Interval Width, time aggregated) and `iw` (Interval Width, per time step / non-aggregated) which compute the width of quantile intervals `q_intervals` (expected to be a tuple or sequence of tuples with (lower quantile, upper quantile). [#2530](https://github.com/unit8co/darts/issues/2530) by [Dennis Bader](https://github.com/dennisbader).
+- Added property `TimeSeries.shape` to get the shape of the time series. [#2530](https://github.com/unit8co/darts/issues/2530) by [Dennis Bader](https://github.com/dennisbader).
+- Added support for parameters `enable_optimization` and `predict_likelihood_parameters` to the forecasting models' `backtest()` and `residuals()` methods. [#2530](https://github.com/unit8co/darts/issues/2530) by [Dennis Bader](https://github.com/dennisbader).
 - Helper function `darts.utils.utils.generate_index()` now accepts datetime strings as `start` and `end` parameters to generate the pandas DatetimeIndex. [#2522](https://github.com/unit8co/darts/pull/2522) by [Dennis Bader](https://github.com/dennisbader).
 - Various improvements in the documentation:
   - Made README's forecasting model support table more colorblind-friendly. [#2433](https://github.com/unit8co/darts/pull/2433)
diff --git a/darts/ad/anomaly_model/forecasting_am.py b/darts/ad/anomaly_model/forecasting_am.py
index a70c5b21bb..fd3eb9a33a 100644
--- a/darts/ad/anomaly_model/forecasting_am.py
+++ b/darts/ad/anomaly_model/forecasting_am.py
@@ -132,6 +132,7 @@ def fit(
             `train_length`.
         enable_optimization
             Whether to use the optimized version of historical_forecasts when supported and available.
+            Default: ``True``.
         model_fit_kwargs
             Parameters to be passed on to the forecast model `fit()` method.
 
@@ -212,6 +213,7 @@ def score(
             `train_length`.
         enable_optimization
             Whether to use the optimized version of historical_forecasts when supported and available.
+            Default: ``True``.
         return_model_prediction
             Whether to return the forecasting model prediction along with the anomaly scores.
 
@@ -299,6 +301,7 @@ def predict_series(
             `train_length`.
         enable_optimization
             Whether to use the optimized version of historical_forecasts when supported and available.
+            Default: ``True``.
 
         Returns
         -------
@@ -394,6 +397,7 @@ def eval_metric(
             `train_length`.
         enable_optimization
             Whether to use the optimized version of historical_forecasts when supported and available.
+            Default: ``True``.
         metric
             The name of the metric function to use. Must be one of "AUC_ROC" (Area Under the
             Receiver Operating Characteristic Curve) and "AUC_PR" (Average Precision from scores).
@@ -499,6 +503,7 @@ def show_anomalies(
             `train_length`.
         enable_optimization
             Whether to use the optimized version of historical_forecasts when supported and available.
+            Default: ``True``.
         anomalies
             The ground truth of the anomalies (1 if it is an anomaly and 0 if not).
         names_of_scorers
diff --git a/darts/metrics/__init__.py b/darts/metrics/__init__.py
index 7a8cb445da..2d0b5aaa2e 100644
--- a/darts/metrics/__init__.py
+++ b/darts/metrics/__init__.py
@@ -2,7 +2,10 @@
 Metrics
 -------
 
-For deterministic forecasts (point predictions with `num_samples == 1`):
+For deterministic forecasts (point predictions with `num_samples == 1`), probabilistic forecasts (`num_samples > 1`),
+and quantile forecasts. For probablistic and quantile forecasts, use parameter `q` to define the quantile(s) to
+compute the deterministic metrics on:
+
     - Aggregated over time:
         Absolute metrics:
             - :func:`MERR <darts.metrics.metrics.merr>`: Mean Error
@@ -42,8 +45,10 @@
     - Aggregated over time:
         - :func:`MQL <darts.metrics.metrics.mql>`: Mean Quantile Loss
         - :func:`QR <darts.metrics.metrics.qr>`: Quantile Risk
+        - :func:`MIW <darts.metrics.metrics.miw>`: Mean Interval Width
     - Per time step:
         - :func:`QL <darts.metrics.metrics.ql>`: Quantile Loss
+        - :func:`IW <darts.metrics.metrics.iw>`: Interval Width
 
 For Dynamic Time Warping (DTW) (aggregated over time):
     - :func:`DTW <darts.metrics.metrics.dtw_metric>`: Dynamic Time Warping Metric
@@ -57,11 +62,13 @@
     coefficient_of_variation,
     dtw_metric,
     err,
+    iw,
     mae,
     mape,
     marre,
     mase,
     merr,
+    miw,
     mql,
     mse,
     msse,
@@ -90,6 +97,7 @@
     se,
     sle,
     sse,
+    iw,
 }
 
 __all__ = [
@@ -120,4 +128,6 @@
     "sle",
     "smape",
     "sse",
+    "iw",
+    "miw",
 ]
diff --git a/darts/metrics/metrics.py b/darts/metrics/metrics.py
index 40a7e6e15e..1c667a7eb3 100644
--- a/darts/metrics/metrics.py
+++ b/darts/metrics/metrics.py
@@ -8,19 +8,27 @@
 import inspect
 from functools import wraps
 from inspect import signature
-from typing import Callable, List, Optional, Sequence, Tuple, Union
+from typing import Any, Callable, List, Optional, Sequence, Tuple, Union
 
 import numpy as np
+import pandas as pd
 
 from darts import TimeSeries
 from darts.dataprocessing import dtw
 from darts.logging import get_logger, raise_log
-from darts.utils import _build_tqdm_iterator, _parallel_apply, n_steps_between
 from darts.utils.ts_utils import SeriesType, get_series_seq_type, series2seq
+from darts.utils.utils import (
+    _build_tqdm_iterator,
+    _parallel_apply,
+    likelihood_component_names,
+    n_steps_between,
+    quantile_names,
+)
 
 logger = get_logger(__name__)
 TIME_AX = 0
 COMP_AX = 1
+SMPL_AX = 2
 
 # Note: for new metrics added to this module to be able to leverage the two decorators, it is required both having
 # the `actual_series` and `pred_series` parameters, and not having other ``Sequence`` as args (since these decorators
@@ -33,6 +41,51 @@
 ]
 
 
+def interval_support(func) -> Callable[..., METRIC_OUTPUT_TYPE]:
+    """
+    This decorator adds support for quantile interval metrics with sanity checks, processing, and extraction of
+    quantiles from the intervals.
+    """
+
+    @wraps(func)
+    def wrapper_interval_support(*args, **kwargs):
+        q = kwargs.get("q")
+        if q is not None:
+            raise_log(
+                ValueError(
+                    "`q` is not supported for quantile interval metrics; use `q_interval` instead."
+                )
+            )
+        q_interval = kwargs.get("q_interval")
+        if q_interval is None:
+            raise_log(
+                ValueError("Quantile interval metrics require setting `q_interval`.")
+            )
+        if isinstance(q_interval, tuple):
+            q_interval = [q_interval]
+        q_interval = np.array(q_interval)
+        if not q_interval.ndim == 2 or q_interval.shape[1] != 2:
+            raise_log(
+                ValueError(
+                    "`q_interval` must be a tuple (float, float) or a sequence of tuples (float, float)."
+                ),
+                logger=logger,
+            )
+        if not np.all(q_interval[:, 1] - q_interval[:, 0] > 0):
+            raise_log(
+                ValueError(
+                    "all intervals in `q_interval` must be tuples of (lower q, upper q) with `lower q > upper q`. "
+                    f"Received `q_interval={q_interval}`"
+                ),
+                logger=logger,
+            )
+        kwargs["q_interval"] = q_interval
+        kwargs["q"] = np.sort(np.unique(q_interval))
+        return func(*args, **kwargs)
+
+    return wrapper_interval_support
+
+
 def multi_ts_support(func) -> Callable[..., METRIC_OUTPUT_TYPE]:
     """
     This decorator further adapts the metrics that took as input two (or three for scaled metrics with `insample`)
@@ -60,12 +113,7 @@ def wrapper_multi_ts_support(*args, **kwargs):
 
         params = signature(func).parameters
         n_jobs = kwargs.pop("n_jobs", params["n_jobs"].default)
-        if not isinstance(n_jobs, int):
-            raise_log(ValueError("n_jobs must be an integer"), logger=logger)
-
         verbose = kwargs.pop("verbose", params["verbose"].default)
-        if not isinstance(verbose, bool):
-            raise_log(ValueError("verbose must be a bool"), logger=logger)
 
         # sanity check reduction functions
         _ = _get_reduction(
@@ -131,6 +179,38 @@ def wrapper_multi_ts_support(*args, **kwargs):
             num_series_in_args += int("insample" not in kwargs)
             kwargs.pop("insample", 0)
 
+        # handle `q` (quantile) parameter for probabilistic (or quantile) forecasts
+        if "q" in params:
+            # convert `q` to tuple of (quantile values, optional quantile component names)
+            q = kwargs.get("q", params["q"].default)
+            q_comp_names = None
+            if q is None:
+                kwargs["q"] = None
+            else:
+                if isinstance(q, tuple):
+                    q, q_comp_names = q
+                if isinstance(q, float):
+                    q = np.array([q])
+                else:
+                    q = np.array(q)
+
+                if not np.all(q[1:] - q[:-1] > 0.0):
+                    raise_log(
+                        ValueError(
+                            "`q` must be of type `float`, or a sequence of increasing order with unique values only. "
+                            f"Received `q={q}`."
+                        ),
+                        logger=logger,
+                    )
+                if not np.all(q >= 0.0) & np.all(q <= 1.0):
+                    raise_log(
+                        ValueError(
+                            f"All `q` values must be in the range `(>=0,<=1)`. Received `q={q}`."
+                        ),
+                        logger=logger,
+                    )
+                kwargs["q"] = (q, q_comp_names)
+
         iterator = _build_tqdm_iterator(
             iterable=zip(*input_series),
             verbose=verbose,
@@ -187,20 +267,61 @@ def wrapper_multivariate_support(*args, **kwargs) -> METRIC_OUTPUT_TYPE:
         pred_series = args[1]
         num_series_in_args = 2
 
-        if actual_series.width != pred_series.width:
-            raise_log(
-                ValueError(
-                    f"Mismatch between number of components in `actual_series` "
-                    f"(n={actual_series.width}) and `pred_series` (n={pred_series.width}."
-                ),
-                logger=logger,
-            )
+        q, q_comp_names = kwargs.get("q"), None
+        if q is None:
+            # without quantiles, the number of components must match
+            if actual_series.n_components != pred_series.n_components:
+                raise_log(
+                    ValueError(
+                        f"Mismatch between number of components in `actual_series` "
+                        f"(n={actual_series.width}) and `pred_series` (n={pred_series.width})."
+                    ),
+                    logger=logger,
+                )
+            # compute median for stochastic predictions
+            if pred_series.is_stochastic:
+                q = np.array([0.5])
+        else:
+            # `q` is required to be a tuple (handled by `multi_ts_support` wrapper)
+            if not isinstance(q, tuple) or not len(q) == 2:
+                raise_log(
+                    ValueError(
+                        "`q` must be of tuple of `(np.ndarray, Optional[pd.Index])` "
+                        "where the (quantile values, optioanl quantile component names). "
+                        f"Received `q={q}`."
+                    ),
+                    logger=logger,
+                )
+            q, q_comp_names = q
+            if not pred_series.is_stochastic:
+                # quantile component names are required if the predictions are not stochastic (as for stocahstic
+                # predictions, the quantiles can be retrieved from the sample dimension for each component)
+                if q_comp_names is None:
+                    q_comp_names = pd.Index(
+                        likelihood_component_names(
+                            components=actual_series.components,
+                            parameter_names=quantile_names(q=q),
+                        )
+                    )
+                if not q_comp_names.isin(pred_series.components).all():
+                    raise_log(
+                        ValueError(
+                            f"Computing a metric with quantile(s) `q={q}` is only supported for probabilistic "
+                            f"`pred_series` (num samples > 1) or `pred_series` containing the predicted "
+                            f"quantiles as columns / components. Either pass a probabilistic `pred_series` or "
+                            f"a series containing the expected quantile components: {q_comp_names.tolist()} "
+                        ),
+                        logger=logger,
+                    )
+
+        if "q" in params:
+            kwargs["q"] = (q, q_comp_names)
 
         # handle `insample` parameters for scaled metrics
         input_series = (actual_series, pred_series)
         if "insample" in params:
             insample = args[2]
-            if actual_series.width != insample.width:
+            if actual_series.n_components != insample.n_components:
                 raise_log(
                     ValueError(
                         f"Mismatch between number of components in `actual_series` "
@@ -212,15 +333,18 @@ def wrapper_multivariate_support(*args, **kwargs) -> METRIC_OUTPUT_TYPE:
             num_series_in_args += 1
 
         vals = func(*input_series, *args[num_series_in_args:], **kwargs)
-        if not 1 <= len(vals.shape) <= 2:
+        # bring vals to shape (n_time, n_comp, n_quantile)
+        if not 2 <= len(vals.shape) <= 3:
             raise_log(
                 ValueError(
-                    "Metric output must have 1 dimension for aggregated metrics (e.g. `mae()`, ...), "
-                    "or 2 dimension for time dependent metrics (e.g. `ae()`, ...)"
+                    "Metric output must have 2 dimensions (n components, n quantiles) "
+                    "for aggregated metrics (e.g. `mae()`, ...), "
+                    "or 3 dimension (n times, n components, n quantiles)  "
+                    "for time dependent metrics (e.g. `ae()`, ...)"
                 ),
                 logger=logger,
             )
-        elif len(vals.shape) == 1:
+        if len(vals.shape) == 2:
             vals = np.expand_dims(vals, TIME_AX)
 
         time_reduction = _get_reduction(
@@ -231,6 +355,7 @@ def wrapper_multivariate_support(*args, **kwargs) -> METRIC_OUTPUT_TYPE:
             sanity_check=False,
         )
         if time_reduction is not None:
+            # -> (1, n_comp, n_quantile)
             vals = np.expand_dims(time_reduction(vals, axis=TIME_AX), axis=TIME_AX)
 
         component_reduction = _get_reduction(
@@ -241,40 +366,71 @@ def wrapper_multivariate_support(*args, **kwargs) -> METRIC_OUTPUT_TYPE:
             sanity_check=False,
         )
         if component_reduction is not None:
-            vals = np.expand_dims(component_reduction(vals, axis=COMP_AX), axis=COMP_AX)
+            # -> (*, n_quantile)
+            vals = component_reduction(vals, axis=COMP_AX)
+        else:
+            # -> (*, n_comp * n_quantile), with order [c0_q0, c0_q1, ... c1_q0, c1_q1, ...]
+            vals = vals.reshape(vals.shape[0], -1)
         return vals
 
     return wrapper_multivariate_support
 
 
 def _get_values(
-    vals: np.ndarray, stochastic_quantile: Optional[float] = 0.5
+    vals: np.ndarray,
+    vals_components: pd.Index,
+    actual_components: pd.Index,
+    q: Optional[Tuple[Sequence[float], Union[Optional[pd.Index]]]] = None,
 ) -> np.ndarray:
     """
-    Returns a deterministic or probabilistic numpy array from the values of a time series.
-    For stochastic input values, return either all sample values with (stochastic_quantile=None) or the quantile sample
-    value with (stochastic_quantile {>=0,<=1})
+    Returns a deterministic or probabilistic numpy array from the values of a time series of shape
+    (times, components, samples / quantiles).
+    To extract quantile (sample) values from quantile or stachastic `vals`, use `q`.
+
+    Parameters
+    ----------
+    vals
+        A numpy array with the values of a TimeSeries (actual values or predictions).
+    vals_components
+        The components of the `vals` TimeSeries.
+    actual_components
+        The components of the actual TimeSeries.
+    q
+        Optionally, for stochastic or quantile series/values, return deterministic quantile values.
+        If not `None`, must a tuple with (quantile values,
+        `None` if `pred_series` is stochastic else the quantile component names).
     """
-    if vals.shape[2] == 1:  # deterministic
-        out = vals[:, :, 0]
-    else:  # stochastic
-        if stochastic_quantile is None:
-            out = vals
-        else:
-            out = np.quantile(vals, stochastic_quantile, axis=2)
-    return out
+    # return values as is (times, components, samples)
+    if q is None:
+        return vals
+
+    q, q_names = q
+    if vals.shape[SMPL_AX] == 1:  # deterministic (or quantile components) input
+        if q_names is not None:
+            # `q_names` are the component names of the predicted quantile parameters
+            # we extract the relevant quantile components with shape (times, components * quantiles)
+            vals = vals[:, vals_components.get_indexer(q_names)]
+            # rearrange into (times, components, quantiles)
+            vals = vals.reshape((len(vals), len(actual_components), -1))
+        return vals
+
+    # probabilistic input
+    # compute multiple quantiles for all times and components; with shape: (quantiles, times, components)
+    out = np.quantile(vals, q, axis=SMPL_AX)
+    # rearrange into (times, components, quantiles)
+    return out.transpose((1, 2, 0))
 
 
 def _get_values_or_raise(
     series_a: TimeSeries,
     series_b: TimeSeries,
     intersect: bool,
-    stochastic_quantile: Optional[float] = 0.5,
+    q: Optional[Tuple[Sequence[float], Union[Optional[pd.Index]]]] = None,
     remove_nan_union: bool = False,
     is_insample: bool = False,
 ) -> Tuple[np.ndarray, np.ndarray]:
     """Returns the processed numpy values of two time series. Processing can be customized with arguments
-    `intersect, stochastic_quantile, remove_nan_union`.
+    `intersect, q, remove_nan_union`.
 
     Parameters
     ----------
@@ -285,9 +441,10 @@ def _get_values_or_raise(
         A deterministic or stochastic ``TimeSeries`` instance (the predictions `pred_series`).
     intersect
         A boolean for whether to only consider the time intersection between `series_a` and `series_b`
-    stochastic_quantile
-        Optionally, for stochastic predicted series, return either all sample values with (`stochastic_quantile=None`)
-        or any deterministic quantile sample values by setting `stochastic_quantile=quantile` {>=0,<=1}.
+    q
+        Optionally, for predicted stochastic or quantile series, return deterministic quantile values.
+        If not `None`, must a tuple with (quantile values,
+        `None` if `pred_series` is stochastic else the quantile component names).
     remove_nan_union
         By setting `remove_non_union` to True, sets all values from `series_a` and `series_b` to `np.nan` at indices
         where any of the two series contain a NaN value. Only effective when `is_insample=False`.
@@ -299,16 +456,6 @@ def _get_values_or_raise(
     ValueError
         If `is_insample=False` and the two time series do not have at least a partially overlapping time index.
     """
-
-    if not series_a.width == series_b.width:
-        raise_log(
-            ValueError("The two time series must have the same number of components"),
-            logger=logger,
-        )
-
-    if not isinstance(intersect, bool):
-        raise_log(ValueError("The intersect parameter must be a bool"), logger=logger)
-
     make_copy = False
     if not is_insample:
         # get the time intersection and values of the two series (corresponds to `actual_series` and `pred_series`
@@ -319,15 +466,12 @@ def _get_values_or_raise(
             vals_a_common = series_a.slice_intersect_values(series_b, copy=make_copy)
             vals_b_common = series_b.slice_intersect_values(series_a, copy=make_copy)
 
-        if not len(vals_a_common) == len(vals_b_common):
-            raise_log(
-                ValueError(
-                    "The two time series must have at least a partially overlapping time index."
-                ),
-                logger=logger,
-            )
-
-        vals_b_det = _get_values(vals_b_common, stochastic_quantile=stochastic_quantile)
+        vals_b = _get_values(
+            vals=vals_b_common,
+            vals_components=series_b.components,
+            actual_components=series_a.components,
+            q=q,
+        )
     else:
         # for `insample` series we extract only values up until before start of `pred_series`
         # find how many steps `insample` overlaps into `series_b`
@@ -347,36 +491,67 @@ def _get_values_or_raise(
             )
         end = end or None
         vals_a_common = series_a.all_values(copy=make_copy)[:end]
-        vals_b_det = None
-    vals_a_det = _get_values(vals_a_common, stochastic_quantile=stochastic_quantile)
+        vals_b = None
+    vals_a = _get_values(
+        vals=vals_a_common,
+        vals_components=series_a.components,
+        actual_components=series_a.components,
+        q=([0.5], None),
+    )
 
     if not remove_nan_union or is_insample:
-        return vals_a_det, vals_b_det
+        return vals_a, vals_b
 
-    b_is_deterministic = bool(len(vals_b_det.shape) == 2)
-    if b_is_deterministic:
-        isnan_mask = np.logical_or(np.isnan(vals_a_det), np.isnan(vals_b_det))
-        isnan_mask_pred = isnan_mask
-    else:
-        isnan_mask = np.logical_or(
-            np.isnan(vals_a_det), np.isnan(vals_b_det).any(axis=2)
-        )
-        isnan_mask_pred = np.repeat(
-            np.expand_dims(isnan_mask, axis=-1), vals_b_det.shape[2], axis=2
-        )
-    return np.where(isnan_mask, np.nan, vals_a_det), np.where(
-        isnan_mask_pred, np.nan, vals_b_det
+    isnan_mask = np.expand_dims(
+        np.logical_or(np.isnan(vals_a), np.isnan(vals_b)).any(axis=SMPL_AX), axis=-1
+    )
+    isnan_mask_pred = np.repeat(isnan_mask, vals_b.shape[SMPL_AX], axis=SMPL_AX)
+    return np.where(isnan_mask, np.nan, vals_a), np.where(
+        isnan_mask_pred, np.nan, vals_b
     )
 
 
+def _get_quantile_intervals(
+    vals: np.ndarray,
+    q: Tuple[Sequence[float], Any],
+    q_interval: np.ndarray = None,
+) -> Tuple[np.ndarray, np.ndarray]:
+    """Returns the lower and upper bound values from `vals` for all quantile intervals in `q_interval`.
+
+    Parameters
+    ----------
+    vals
+        A numpy array with predicted quantile values of shape (n times, n components, n quantiles).
+    q
+        A tuple with (quantile values, any).
+    q_interval
+        A numpy array with the lower and upper quantile interval bound of shape (n intervals, 2).
+    """
+    q, _ = q
+    # find index of every `q_interval` value in `q`; we have guarantees from support wrappers:
+    # - `q` has increasing order
+    # - `vals` has same order as `q` in dim 3 (quantile dim)
+    # - `q_interval` holds (lower q, upper q) in that order
+    q_idx = np.searchsorted(q, q_interval.flatten()).reshape(q_interval.shape)
+    return vals[:, :, q_idx[:, 0]], vals[:, :, q_idx[:, 1]]
+
+
 def _get_wrapped_metric(
-    func: Callable[..., METRIC_OUTPUT_TYPE],
+    func: Callable[..., METRIC_OUTPUT_TYPE], n_wrappers: int = 2
 ) -> Callable[..., METRIC_OUTPUT_TYPE]:
     """Returns the inner metric function `func` which bypasses the decorators `multi_ts_support` and
     `multivariate_support`. It significantly decreases process time compared to calling `func` directly.
     Only use this to compute a pre-defined metric within the scope of another metric.
     """
-    return func.__wrapped__.__wrapped__
+    if not 2 <= n_wrappers <= 3:
+        raise_log(
+            NotImplementedError("Only 2-3 wrappers are currently supported"),
+            logger=logger,
+        )
+    if n_wrappers == 2:
+        return func.__wrapped__.__wrapped__
+    else:
+        return func.__wrapped__.__wrapped__.__wrapped__
 
 
 def _get_reduction(
@@ -471,6 +646,7 @@ def err(
     pred_series: Union[TimeSeries, Sequence[TimeSeries]],
     intersect: bool = True,
     *,
+    q: Optional[Union[float, List[float], Tuple[np.ndarray, pd.Index]]] = None,
     time_reduction: Optional[Callable[..., np.ndarray]] = None,
     component_reduction: Optional[Callable[[np.ndarray], float]] = np.nanmean,
     series_reduction: Optional[Callable[[np.ndarray], Union[float, np.ndarray]]] = None,
@@ -484,8 +660,9 @@ def err(
 
     .. math:: y_t - \\hat{y}_t
 
-    If any of the series is stochastic (containing several samples), :math:`\\hat{y}_t` is the median over all samples
-    for time step :math:`t`.
+    If :math:`\\hat{y}_t` are stochastic (contains several samples) or quantile predictions, use parameter `q` to
+    specify on which quantile(s) to compute the metric on. By default, it uses the median 0.5 quantile
+    (over all samples, or, if given, the quantile prediction itself).
 
     Parameters
     ----------
@@ -496,6 +673,8 @@ def err(
     intersect
         For time series that are overlapping in time without having the same time index, setting `True`
         will consider the values only over their common time interval (intersection in time).
+    q
+        Optionally, the quantile (float [0, 1]) or list of quantiles of interest to compute the metric on.
     time_reduction
         Optionally, a function to aggregate the metrics over the time axis. It must reduce a `np.ndarray`
         of shape `(t, c)` to a `np.ndarray` of shape `(c,)`. The function takes as input a ``np.ndarray`` and a
@@ -543,7 +722,11 @@ def err(
     """
 
     y_true, y_pred = _get_values_or_raise(
-        actual_series, pred_series, intersect, remove_nan_union=False
+        actual_series,
+        pred_series,
+        intersect,
+        remove_nan_union=False,
+        q=q,
     )
     return y_true - y_pred
 
@@ -555,6 +738,7 @@ def merr(
     pred_series: Union[TimeSeries, Sequence[TimeSeries]],
     intersect: bool = True,
     *,
+    q: Optional[Union[float, List[float], Tuple[np.ndarray, pd.Index]]] = None,
     component_reduction: Optional[Callable[[np.ndarray], float]] = np.nanmean,
     series_reduction: Optional[Callable[[np.ndarray], Union[float, np.ndarray]]] = None,
     n_jobs: int = 1,
@@ -567,8 +751,9 @@ def merr(
 
     .. math:: \\frac{1}{T}\\sum_{t=1}^T{(y_t - \\hat{y}_t)}
 
-    If any of the series is stochastic (containing several samples), :math:`\\hat{y}_t` is the median over all samples
-    for time step :math:`t`.
+    If :math:`\\hat{y}_t` are stochastic (contains several samples) or quantile predictions, use parameter `q` to
+    specify on which quantile(s) to compute the metric on. By default, it uses the median 0.5 quantile
+    (over all samples, or, if given, the quantile prediction itself).
 
     Parameters
     ----------
@@ -579,6 +764,8 @@ def merr(
     intersect
         For time series that are overlapping in time without having the same time index, setting `True`
         will consider the values only over their common time interval (intersection in time).
+    q
+        Optionally, the quantile (float [0, 1]) or list of quantiles of interest to compute the metric on.
     component_reduction
         Optionally, a function to aggregate the metrics over the component/column axis. It must reduce a `np.ndarray`
         of shape `(t, c)` to a `np.ndarray` of shape `(t,)`. The function takes as input a ``np.ndarray`` and a
@@ -620,6 +807,7 @@ def merr(
             actual_series,
             pred_series,
             intersect,
+            q=q,
         ),
         axis=TIME_AX,
     )
@@ -632,6 +820,7 @@ def ae(
     pred_series: Union[TimeSeries, Sequence[TimeSeries]],
     intersect: bool = True,
     *,
+    q: Optional[Union[float, List[float], Tuple[np.ndarray, pd.Index]]] = None,
     time_reduction: Optional[Callable[..., np.ndarray]] = None,
     component_reduction: Optional[Callable[[np.ndarray], float]] = np.nanmean,
     series_reduction: Optional[Callable[[np.ndarray], Union[float, np.ndarray]]] = None,
@@ -645,8 +834,9 @@ def ae(
 
     .. math:: |y_t - \\hat{y}_t|
 
-    If any of the series is stochastic (containing several samples), :math:`\\hat{y}_t` is the median over all samples
-    for time step :math:`t`.
+    If :math:`\\hat{y}_t` are stochastic (contains several samples) or quantile predictions, use parameter `q` to
+    specify on which quantile(s) to compute the metric on. By default, it uses the median 0.5 quantile
+    (over all samples, or, if given, the quantile prediction itself).
 
     Parameters
     ----------
@@ -657,6 +847,8 @@ def ae(
     intersect
         For time series that are overlapping in time without having the same time index, setting `True`
         will consider the values only over their common time interval (intersection in time).
+    q
+        Optionally, the quantile (float [0, 1]) or list of quantiles of interest to compute the metric on.
     time_reduction
         Optionally, a function to aggregate the metrics over the time axis. It must reduce a `np.ndarray`
         of shape `(t, c)` to a `np.ndarray` of shape `(c,)`. The function takes as input a ``np.ndarray`` and a
@@ -704,7 +896,11 @@ def ae(
     """
 
     y_true, y_pred = _get_values_or_raise(
-        actual_series, pred_series, intersect, remove_nan_union=False
+        actual_series,
+        pred_series,
+        intersect,
+        remove_nan_union=False,
+        q=q,
     )
     return np.abs(y_true - y_pred)
 
@@ -716,6 +912,7 @@ def mae(
     pred_series: Union[TimeSeries, Sequence[TimeSeries]],
     intersect: bool = True,
     *,
+    q: Optional[Union[float, List[float], Tuple[np.ndarray, pd.Index]]] = None,
     component_reduction: Optional[Callable[[np.ndarray], float]] = np.nanmean,
     series_reduction: Optional[Callable[[np.ndarray], Union[float, np.ndarray]]] = None,
     n_jobs: int = 1,
@@ -728,8 +925,9 @@ def mae(
 
     .. math:: \\frac{1}{T}\\sum_{t=1}^T{|y_t - \\hat{y}_t|}
 
-    If any of the series is stochastic (containing several samples), :math:`\\hat{y}_t` is the median over all samples
-    for time step :math:`t`.
+    If :math:`\\hat{y}_t` are stochastic (contains several samples) or quantile predictions, use parameter `q` to
+    specify on which quantile(s) to compute the metric on. By default, it uses the median 0.5 quantile
+    (over all samples, or, if given, the quantile prediction itself).
 
     Parameters
     ----------
@@ -740,6 +938,8 @@ def mae(
     intersect
         For time series that are overlapping in time without having the same time index, setting `True`
         will consider the values only over their common time interval (intersection in time).
+    q
+        Optionally, the quantile (float [0, 1]) or list of quantiles of interest to compute the metric on.
     component_reduction
         Optionally, a function to aggregate the metrics over the component/column axis. It must reduce a `np.ndarray`
         of shape `(t, c)` to a `np.ndarray` of shape `(t,)`. The function takes as input a ``np.ndarray`` and a
@@ -781,6 +981,7 @@ def mae(
             actual_series,
             pred_series,
             intersect,
+            q=q,
         ),
         axis=TIME_AX,
     )
@@ -795,6 +996,7 @@ def ase(
     m: int = 1,
     intersect: bool = True,
     *,
+    q: Optional[Union[float, List[float], Tuple[np.ndarray, pd.Index]]] = None,
     time_reduction: Optional[Callable[..., np.ndarray]] = None,
     component_reduction: Optional[Callable[[np.ndarray], float]] = np.nanmean,
     series_reduction: Optional[Callable[[np.ndarray], Union[float, np.ndarray]]] = None,
@@ -816,8 +1018,9 @@ def ase(
 
     .. math:: E_m = MAE(y_{m:t_p}, y_{0:t_p - m}).
 
-    If any of the series is stochastic (containing several samples), :math:`\\hat{y}_t` is the median over all samples
-    for time step :math:`t`.
+    If :math:`\\hat{y}_t` are stochastic (contains several samples) or quantile predictions, use parameter `q` to
+    specify on which quantile(s) to compute the metric on. By default, it uses the median 0.5 quantile
+    (over all samples, or, if given, the quantile prediction itself).
 
     Parameters
     ----------
@@ -835,6 +1038,8 @@ def ase(
     intersect
         For time series that are overlapping in time without having the same time index, setting `True`
         will consider the values only over their common time interval (intersection in time).
+    q
+        Optionally, the quantile (float [0, 1]) or list of quantiles of interest to compute the metric on.
     time_reduction
         Optionally, a function to aggregate the metrics over the time axis. It must reduce a `np.ndarray`
         of shape `(t, c)` to a `np.ndarray` of shape `(c,)`. The function takes as input a ``np.ndarray`` and a
@@ -895,6 +1100,7 @@ def ase(
         actual_series,
         pred_series,
         intersect,
+        q=q,
     )
     return errors / error_scale
 
@@ -908,6 +1114,7 @@ def mase(
     m: int = 1,
     intersect: bool = True,
     *,
+    q: Optional[Union[float, List[float], Tuple[np.ndarray, pd.Index]]] = None,
     component_reduction: Optional[Callable[[np.ndarray], float]] = np.nanmean,
     series_reduction: Optional[Callable[[np.ndarray], Union[float, np.ndarray]]] = None,
     n_jobs: int = 1,
@@ -928,8 +1135,9 @@ def mase(
 
     .. math:: E_m = MAE(y_{m:t_p}, y_{0:t_p - m}).
 
-    If any of the series is stochastic (containing several samples), :math:`\\hat{y}_t` is the median over all samples
-    for time step :math:`t`.
+    If :math:`\\hat{y}_t` are stochastic (contains several samples) or quantile predictions, use parameter `q` to
+    specify on which quantile(s) to compute the metric on. By default, it uses the median 0.5 quantile
+    (over all samples, or, if given, the quantile prediction itself).
 
     Parameters
     ----------
@@ -947,6 +1155,8 @@ def mase(
     intersect
         For time series that are overlapping in time without having the same time index, setting `True`
         will consider the values only over their common time interval (intersection in time).
+    q
+        Optionally, the quantile (float [0, 1]) or list of quantiles of interest to compute the metric on.
     component_reduction
         Optionally, a function to aggregate the metrics over the component/column axis. It must reduce a `np.ndarray`
         of shape `(t, c)` to a `np.ndarray` of shape `(t,)`. The function takes as input a ``np.ndarray`` and a
@@ -1000,6 +1210,7 @@ def mase(
             insample,
             m=m,
             intersect=intersect,
+            q=q,
         ),
         axis=TIME_AX,
     )
@@ -1012,6 +1223,7 @@ def se(
     pred_series: Union[TimeSeries, Sequence[TimeSeries]],
     intersect: bool = True,
     *,
+    q: Optional[Union[float, List[float], Tuple[np.ndarray, pd.Index]]] = None,
     time_reduction: Optional[Callable[..., np.ndarray]] = None,
     component_reduction: Optional[Callable[[np.ndarray], float]] = np.nanmean,
     series_reduction: Optional[Callable[[np.ndarray], Union[float, np.ndarray]]] = None,
@@ -1025,8 +1237,9 @@ def se(
 
     .. math:: (y_t - \\hat{y}_t)^2.
 
-    If any of the series is stochastic (containing several samples), :math:`\\hat{y}_t` is the median over all samples
-    for time step :math:`t`.
+    If :math:`\\hat{y}_t` are stochastic (contains several samples) or quantile predictions, use parameter `q` to
+    specify on which quantile(s) to compute the metric on. By default, it uses the median 0.5 quantile
+    (over all samples, or, if given, the quantile prediction itself).
 
     Parameters
     ----------
@@ -1037,6 +1250,8 @@ def se(
     intersect
         For time series that are overlapping in time without having the same time index, setting `True`
         will consider the values only over their common time interval (intersection in time).
+    q
+        Optionally, the quantile (float [0, 1]) or list of quantiles of interest to compute the metric on.
     time_reduction
         Optionally, a function to aggregate the metrics over the time axis. It must reduce a `np.ndarray`
         of shape `(t, c)` to a `np.ndarray` of shape `(c,)`. The function takes as input a ``np.ndarray`` and a
@@ -1084,7 +1299,11 @@ def se(
     """
 
     y_true, y_pred = _get_values_or_raise(
-        actual_series, pred_series, intersect, remove_nan_union=False
+        actual_series,
+        pred_series,
+        intersect,
+        remove_nan_union=False,
+        q=q,
     )
     return (y_true - y_pred) ** 2
 
@@ -1096,6 +1315,7 @@ def mse(
     pred_series: Union[TimeSeries, Sequence[TimeSeries]],
     intersect: bool = True,
     *,
+    q: Optional[Union[float, List[float], Tuple[np.ndarray, pd.Index]]] = None,
     component_reduction: Optional[Callable[[np.ndarray], float]] = np.nanmean,
     series_reduction: Optional[Callable[[np.ndarray], Union[float, np.ndarray]]] = None,
     n_jobs: int = 1,
@@ -1108,8 +1328,9 @@ def mse(
 
     .. math:: \\frac{1}{T}\\sum_{t=1}^T{(y_t - \\hat{y}_t)^2}.
 
-    If any of the series is stochastic (containing several samples), :math:`\\hat{y}_t` is the median over all samples
-    for time step :math:`t`.
+    If :math:`\\hat{y}_t` are stochastic (contains several samples) or quantile predictions, use parameter `q` to
+    specify on which quantile(s) to compute the metric on. By default, it uses the median 0.5 quantile
+    (over all samples, or, if given, the quantile prediction itself).
 
     Parameters
     ----------
@@ -1120,6 +1341,8 @@ def mse(
     intersect
         For time series that are overlapping in time without having the same time index, setting `True`
         will consider the values only over their common time interval (intersection in time).
+    q
+        Optionally, the quantile (float [0, 1]) or list of quantiles of interest to compute the metric on.
     component_reduction
         Optionally, a function to aggregate the metrics over the component/column axis. It must reduce a `np.ndarray`
         of shape `(t, c)` to a `np.ndarray` of shape `(t,)`. The function takes as input a ``np.ndarray`` and a
@@ -1161,6 +1384,7 @@ def mse(
             actual_series,
             pred_series,
             intersect,
+            q=q,
         ),
         axis=TIME_AX,
     )
@@ -1175,6 +1399,7 @@ def sse(
     m: int = 1,
     intersect: bool = True,
     *,
+    q: Optional[Union[float, List[float], Tuple[np.ndarray, pd.Index]]] = None,
     time_reduction: Optional[Callable[..., np.ndarray]] = None,
     component_reduction: Optional[Callable[[np.ndarray], float]] = np.nanmean,
     series_reduction: Optional[Callable[[np.ndarray], Union[float, np.ndarray]]] = None,
@@ -1196,8 +1421,9 @@ def sse(
 
     .. math:: E_m = MSE(y_{m:t_p}, y_{0:t_p - m}).
 
-    If any of the series is stochastic (containing several samples), :math:`\\hat{y}_t` is the median over all samples
-    for time step :math:`t`.
+    If :math:`\\hat{y}_t` are stochastic (contains several samples) or quantile predictions, use parameter `q` to
+    specify on which quantile(s) to compute the metric on. By default, it uses the median 0.5 quantile
+    (over all samples, or, if given, the quantile prediction itself).
 
     Parameters
     ----------
@@ -1215,6 +1441,8 @@ def sse(
     intersect
         For time series that are overlapping in time without having the same time index, setting `True`
         will consider the values only over their common time interval (intersection in time).
+    q
+        Optionally, the quantile (float [0, 1]) or list of quantiles of interest to compute the metric on.
     time_reduction
         Optionally, a function to aggregate the metrics over the time axis. It must reduce a `np.ndarray`
         of shape `(t, c)` to a `np.ndarray` of shape `(c,)`. The function takes as input a ``np.ndarray`` and a
@@ -1275,6 +1503,7 @@ def sse(
         actual_series,
         pred_series,
         intersect,
+        q=q,
     )
     return errors / error_scale
 
@@ -1288,6 +1517,7 @@ def msse(
     m: int = 1,
     intersect: bool = True,
     *,
+    q: Optional[Union[float, List[float], Tuple[np.ndarray, pd.Index]]] = None,
     component_reduction: Optional[Callable[[np.ndarray], float]] = np.nanmean,
     series_reduction: Optional[Callable[[np.ndarray], Union[float, np.ndarray]]] = None,
     n_jobs: int = 1,
@@ -1308,8 +1538,9 @@ def msse(
 
     .. math:: E_m = MSE(y_{m:t_p}, y_{0:t_p - m}).
 
-    If any of the series is stochastic (containing several samples), :math:`\\hat{y}_t` is the median over all samples
-    for time step :math:`t`.
+    If :math:`\\hat{y}_t` are stochastic (contains several samples) or quantile predictions, use parameter `q` to
+    specify on which quantile(s) to compute the metric on. By default, it uses the median 0.5 quantile
+    (over all samples, or, if given, the quantile prediction itself).
 
     Parameters
     ----------
@@ -1327,6 +1558,8 @@ def msse(
     intersect
         For time series that are overlapping in time without having the same time index, setting `True`
         will consider the values only over their common time interval (intersection in time).
+    q
+        Optionally, the quantile (float [0, 1]) or list of quantiles of interest to compute the metric on.
     component_reduction
         Optionally, a function to aggregate the metrics over the component/column axis. It must reduce a `np.ndarray`
         of shape `(t, c)` to a `np.ndarray` of shape `(t,)`. The function takes as input a ``np.ndarray`` and a
@@ -1380,6 +1613,7 @@ def msse(
             insample,
             m=m,
             intersect=intersect,
+            q=q,
         ),
         axis=TIME_AX,
     )
@@ -1392,6 +1626,7 @@ def rmse(
     pred_series: Union[TimeSeries, Sequence[TimeSeries]],
     intersect: bool = True,
     *,
+    q: Optional[Union[float, List[float], Tuple[np.ndarray, pd.Index]]] = None,
     component_reduction: Optional[Callable[[np.ndarray], float]] = np.nanmean,
     series_reduction: Optional[Callable[[np.ndarray], Union[float, np.ndarray]]] = None,
     n_jobs: int = 1,
@@ -1404,8 +1639,9 @@ def rmse(
 
     .. math:: \\sqrt{\\frac{1}{T}\\sum_{t=1}^T{(y_t - \\hat{y}_t)^2}}
 
-    If any of the series is stochastic (containing several samples), :math:`\\hat{y}_t` is the median over all samples
-    for time step :math:`t`.
+    If :math:`\\hat{y}_t` are stochastic (contains several samples) or quantile predictions, use parameter `q` to
+    specify on which quantile(s) to compute the metric on. By default, it uses the median 0.5 quantile
+    (over all samples, or, if given, the quantile prediction itself).
 
     Parameters
     ----------
@@ -1416,6 +1652,8 @@ def rmse(
     intersect
         For time series that are overlapping in time without having the same time index, setting `True`
         will consider the values only over their common time interval (intersection in time).
+    q
+        Optionally, the quantile (float [0, 1]) or list of quantiles of interest to compute the metric on.
     component_reduction
         Optionally, a function to aggregate the metrics over the component/column axis. It must reduce a `np.ndarray`
         of shape `(t, c)` to a `np.ndarray` of shape `(t,)`. The function takes as input a ``np.ndarray`` and a
@@ -1457,6 +1695,7 @@ def rmse(
             actual_series,
             pred_series,
             intersect,
+            q=q,
         )
     )
 
@@ -1470,6 +1709,7 @@ def rmsse(
     m: int = 1,
     intersect: bool = True,
     *,
+    q: Optional[Union[float, List[float], Tuple[np.ndarray, pd.Index]]] = None,
     component_reduction: Optional[Callable[[np.ndarray], float]] = np.nanmean,
     series_reduction: Optional[Callable[[np.ndarray], Union[float, np.ndarray]]] = None,
     n_jobs: int = 1,
@@ -1490,8 +1730,9 @@ def rmsse(
 
     .. math:: E_m = RMSE(y_{m:t_p}, y_{0:t_p - m}).
 
-    If any of the series is stochastic (containing several samples), :math:`\\hat{y}_t` is the median over all samples
-    for time step :math:`t`.
+    If :math:`\\hat{y}_t` are stochastic (contains several samples) or quantile predictions, use parameter `q` to
+    specify on which quantile(s) to compute the metric on. By default, it uses the median 0.5 quantile
+    (over all samples, or, if given, the quantile prediction itself).
 
     Parameters
     ----------
@@ -1509,6 +1750,8 @@ def rmsse(
     intersect
         For time series that are overlapping in time without having the same time index, setting `True`
         will consider the values only over their common time interval (intersection in time).
+    q
+        Optionally, the quantile (float [0, 1]) or list of quantiles of interest to compute the metric on.
     component_reduction
         Optionally, a function to aggregate the metrics over the component/column axis. It must reduce a `np.ndarray`
         of shape `(t, c)` to a `np.ndarray` of shape `(t,)`. The function takes as input a ``np.ndarray`` and a
@@ -1560,6 +1803,7 @@ def rmsse(
         actual_series,
         pred_series,
         intersect,
+        q=q,
     )
     return errors / error_scale
 
@@ -1571,6 +1815,7 @@ def sle(
     pred_series: Union[TimeSeries, Sequence[TimeSeries]],
     intersect: bool = True,
     *,
+    q: Optional[Union[float, List[float], Tuple[np.ndarray, pd.Index]]] = None,
     time_reduction: Optional[Callable[..., np.ndarray]] = None,
     component_reduction: Optional[Callable[[np.ndarray], float]] = np.nanmean,
     series_reduction: Optional[Callable[[np.ndarray], Union[float, np.ndarray]]] = None,
@@ -1586,8 +1831,9 @@ def sle(
 
     using the natural logarithm.
 
-    If any of the series is stochastic (containing several samples), :math:`\\hat{y}_t` is the median over all samples
-    for time step :math:`t`.
+    If :math:`\\hat{y}_t` are stochastic (contains several samples) or quantile predictions, use parameter `q` to
+    specify on which quantile(s) to compute the metric on. By default, it uses the median 0.5 quantile
+    (over all samples, or, if given, the quantile prediction itself).
 
     Parameters
     ----------
@@ -1598,6 +1844,8 @@ def sle(
     intersect
         For time series that are overlapping in time without having the same time index, setting `True`
         will consider the values only over their common time interval (intersection in time).
+    q
+        Optionally, the quantile (float [0, 1]) or list of quantiles of interest to compute the metric on.
     time_reduction
         Optionally, a function to aggregate the metrics over the time axis. It must reduce a `np.ndarray`
         of shape `(t, c)` to a `np.ndarray` of shape `(c,)`. The function takes as input a ``np.ndarray`` and a
@@ -1645,7 +1893,11 @@ def sle(
     """
 
     y_true, y_pred = _get_values_or_raise(
-        actual_series, pred_series, intersect, remove_nan_union=False
+        actual_series,
+        pred_series,
+        intersect,
+        remove_nan_union=False,
+        q=q,
     )
     y_true, y_pred = np.log(y_true + 1), np.log(y_pred + 1)
     return (y_true - y_pred) ** 2
@@ -1658,6 +1910,7 @@ def rmsle(
     pred_series: Union[TimeSeries, Sequence[TimeSeries]],
     intersect: bool = True,
     *,
+    q: Optional[Union[float, List[float], Tuple[np.ndarray, pd.Index]]] = None,
     component_reduction: Optional[Callable[[np.ndarray], float]] = np.nanmean,
     series_reduction: Optional[Callable[[np.ndarray], Union[float, np.ndarray]]] = None,
     n_jobs: int = 1,
@@ -1672,8 +1925,9 @@ def rmsle(
 
     using the natural logarithm.
 
-    If any of the series is stochastic (containing several samples), :math:`\\hat{y}_t` is the median over all samples
-    for time step :math:`t`.
+    If :math:`\\hat{y}_t` are stochastic (contains several samples) or quantile predictions, use parameter `q` to
+    specify on which quantile(s) to compute the metric on. By default, it uses the median 0.5 quantile
+    (over all samples, or, if given, the quantile prediction itself).
 
     Parameters
     ----------
@@ -1684,6 +1938,8 @@ def rmsle(
     intersect
         For time series that are overlapping in time without having the same time index, setting `True`
         will consider the values only over their common time interval (intersection in time).
+    q
+        Optionally, the quantile (float [0, 1]) or list of quantiles of interest to compute the metric on.
     component_reduction
         Optionally, a function to aggregate the metrics over the component/column axis. It must reduce a `np.ndarray`
         of shape `(t, c)` to a `np.ndarray` of shape `(t,)`. The function takes as input a ``np.ndarray`` and a
@@ -1726,6 +1982,7 @@ def rmsle(
                 actual_series,
                 pred_series,
                 intersect,
+                q=q,
             ),
             axis=TIME_AX,
         )
@@ -1739,6 +1996,7 @@ def ape(
     pred_series: Union[TimeSeries, Sequence[TimeSeries]],
     intersect: bool = True,
     *,
+    q: Optional[Union[float, List[float], Tuple[np.ndarray, pd.Index]]] = None,
     time_reduction: Optional[Callable[..., np.ndarray]] = None,
     component_reduction: Optional[Callable[[np.ndarray], float]] = np.nanmean,
     series_reduction: Optional[Callable[[np.ndarray], Union[float, np.ndarray]]] = None,
@@ -1755,8 +2013,9 @@ def ape(
     Note that it will raise a `ValueError` if :math:`y_t = 0` for some :math:`t`. Consider using
     the Absolute Scaled Error (:func:`~darts.metrics.metrics.ase`) in these cases.
 
-    If any of the series is stochastic (containing several samples), :math:`\\hat{y}_t` is the median over all samples
-    for time step :math:`t`.
+    If :math:`\\hat{y}_t` are stochastic (contains several samples) or quantile predictions, use parameter `q` to
+    specify on which quantile(s) to compute the metric on. By default, it uses the median 0.5 quantile
+    (over all samples, or, if given, the quantile prediction itself).
 
     Parameters
     ----------
@@ -1767,6 +2026,8 @@ def ape(
     intersect
         For time series that are overlapping in time without having the same time index, setting `True`
         will consider the values only over their common time interval (intersection in time).
+    q
+        Optionally, the quantile (float [0, 1]) or list of quantiles of interest to compute the metric on.
     time_reduction
         Optionally, a function to aggregate the metrics over the time axis. It must reduce a `np.ndarray`
         of shape `(t, c)` to a `np.ndarray` of shape `(c,)`. The function takes as input a ``np.ndarray`` and a
@@ -1819,7 +2080,11 @@ def ape(
     """
 
     y_true, y_pred = _get_values_or_raise(
-        actual_series, pred_series, intersect, remove_nan_union=False
+        actual_series,
+        pred_series,
+        intersect,
+        remove_nan_union=False,
+        q=q,
     )
     if not (y_true != 0).all():
         raise_log(
@@ -1838,6 +2103,7 @@ def mape(
     pred_series: Union[TimeSeries, Sequence[TimeSeries]],
     intersect: bool = True,
     *,
+    q: Optional[Union[float, List[float], Tuple[np.ndarray, pd.Index]]] = None,
     component_reduction: Optional[Callable[[np.ndarray], float]] = np.nanmean,
     series_reduction: Optional[Callable[[np.ndarray], Union[float, np.ndarray]]] = None,
     n_jobs: int = 1,
@@ -1853,8 +2119,9 @@ def mape(
     Note that it will raise a `ValueError` if :math:`y_t = 0` for some :math:`t`. Consider using
     the Mean Absolute Scaled Error (:func:`~darts.metrics.metrics.mase`) in these cases.
 
-    If any of the series is stochastic (containing several samples), :math:`\\hat{y}_t` is the median over all samples
-    for time step :math:`t`.
+    If :math:`\\hat{y}_t` are stochastic (contains several samples) or quantile predictions, use parameter `q` to
+    specify on which quantile(s) to compute the metric on. By default, it uses the median 0.5 quantile
+    (over all samples, or, if given, the quantile prediction itself).
 
     Parameters
     ----------
@@ -1865,6 +2132,8 @@ def mape(
     intersect
         For time series that are overlapping in time without having the same time index, setting `True`
         will consider the values only over their common time interval (intersection in time).
+    q
+        Optionally, the quantile (float [0, 1]) or list of quantiles of interest to compute the metric on.
     component_reduction
         Optionally, a function to aggregate the metrics over the component/column axis. It must reduce a `np.ndarray`
         of shape `(t, c)` to a `np.ndarray` of shape `(t,)`. The function takes as input a ``np.ndarray`` and a
@@ -1912,6 +2181,7 @@ def mape(
             actual_series,
             pred_series,
             intersect,
+            q=q,
         ),
         axis=TIME_AX,
     )
@@ -1924,6 +2194,7 @@ def sape(
     pred_series: Union[TimeSeries, Sequence[TimeSeries]],
     intersect: bool = True,
     *,
+    q: Optional[Union[float, List[float], Tuple[np.ndarray, pd.Index]]] = None,
     time_reduction: Optional[Callable[..., np.ndarray]] = None,
     component_reduction: Optional[Callable[[np.ndarray], float]] = np.nanmean,
     series_reduction: Optional[Callable[[np.ndarray], Union[float, np.ndarray]]] = None,
@@ -1941,8 +2212,9 @@ def sape(
     Note that it will raise a `ValueError` if :math:`\\left| y_t \\right| + \\left| \\hat{y}_t \\right| = 0` for some
     :math:`t`. Consider using the Absolute Scaled Error (:func:`~darts.metrics.metrics.ase`)  in these cases.
 
-    If any of the series is stochastic (containing several samples), :math:`\\hat{y}_t` is the median over all samples
-    for time step :math:`t`.
+    If :math:`\\hat{y}_t` are stochastic (contains several samples) or quantile predictions, use parameter `q` to
+    specify on which quantile(s) to compute the metric on. By default, it uses the median 0.5 quantile
+    (over all samples, or, if given, the quantile prediction itself).
 
     Parameters
     ----------
@@ -1953,6 +2225,8 @@ def sape(
     intersect
         For time series that are overlapping in time without having the same time index, setting `True`
         will consider the values only over their common time interval (intersection in time).
+    q
+        Optionally, the quantile (float [0, 1]) or list of quantiles of interest to compute the metric on.
     time_reduction
         Optionally, a function to aggregate the metrics over the time axis. It must reduce a `np.ndarray`
         of shape `(t, c)` to a `np.ndarray` of shape `(c,)`. The function takes as input a ``np.ndarray`` and a
@@ -2005,7 +2279,11 @@ def sape(
     """
 
     y_true, y_pred = _get_values_or_raise(
-        actual_series, pred_series, intersect, remove_nan_union=True
+        actual_series,
+        pred_series,
+        intersect,
+        remove_nan_union=True,
+        q=q,
     )
     if not np.logical_or(y_true != 0, y_pred != 0).all():
         raise_log(
@@ -2024,6 +2302,7 @@ def smape(
     pred_series: Union[TimeSeries, Sequence[TimeSeries]],
     intersect: bool = True,
     *,
+    q: Optional[Union[float, List[float], Tuple[np.ndarray, pd.Index]]] = None,
     component_reduction: Optional[Callable[[np.ndarray], float]] = np.nanmean,
     series_reduction: Optional[Callable[[np.ndarray], Union[float, np.ndarray]]] = None,
     n_jobs: int = 1,
@@ -2042,8 +2321,9 @@ def smape(
     for some :math:`t`. Consider using the Mean Absolute Scaled Error (:func:`~darts.metrics.metrics.mase`) in these
     cases.
 
-    If any of the series is stochastic (containing several samples), :math:`\\hat{y}_t` is the median over all samples
-    for time step :math:`t`.
+    If :math:`\\hat{y}_t` are stochastic (contains several samples) or quantile predictions, use parameter `q` to
+    specify on which quantile(s) to compute the metric on. By default, it uses the median 0.5 quantile
+    (over all samples, or, if given, the quantile prediction itself).
 
     Parameters
     ----------
@@ -2054,6 +2334,8 @@ def smape(
     intersect
         For time series that are overlapping in time without having the same time index, setting `True`
         will consider the values only over their common time interval (intersection in time).
+    q
+        Optionally, the quantile (float [0, 1]) or list of quantiles of interest to compute the metric on.
     component_reduction
         Optionally, a function to aggregate the metrics over the component/column axis. It must reduce a `np.ndarray`
         of shape `(t, c)` to a `np.ndarray` of shape `(t,)`. The function takes as input a ``np.ndarray`` and a
@@ -2101,6 +2383,7 @@ def smape(
             actual_series,
             pred_series,
             intersect,
+            q=q,
         ),
         axis=TIME_AX,
     )
@@ -2113,6 +2396,7 @@ def ope(
     pred_series: Union[TimeSeries, Sequence[TimeSeries]],
     intersect: bool = True,
     *,
+    q: Optional[Union[float, List[float], Tuple[np.ndarray, pd.Index]]] = None,
     component_reduction: Optional[Callable[[np.ndarray], float]] = np.nanmean,
     series_reduction: Optional[Callable[[np.ndarray], Union[float, np.ndarray]]] = None,
     n_jobs: int = 1,
@@ -2126,8 +2410,9 @@ def ope(
     .. math:: 100 \\cdot \\left| \\frac{\\sum_{t=1}^{T}{y_t}
               - \\sum_{t=1}^{T}{\\hat{y}_t}}{\\sum_{t=1}^{T}{y_t}} \\right|.
 
-    If any of the series is stochastic (containing several samples), :math:`\\hat{y}_t` is the median over all samples
-    for time step :math:`t`.
+    If :math:`\\hat{y}_t` are stochastic (contains several samples) or quantile predictions, use parameter `q` to
+    specify on which quantile(s) to compute the metric on. By default, it uses the median 0.5 quantile
+    (over all samples, or, if given, the quantile prediction itself).
 
     Parameters
     ----------
@@ -2138,6 +2423,8 @@ def ope(
     intersect
         For time series that are overlapping in time without having the same time index, setting `True`
         will consider the values only over their common time interval (intersection in time).
+    q
+        Optionally, the quantile (float [0, 1]) or list of quantiles of interest to compute the metric on.
     component_reduction
         Optionally, a function to aggregate the metrics over the component/column axis. It must reduce a `np.ndarray`
         of shape `(t, c)` to a `np.ndarray` of shape `(t,)`. The function takes as input a ``np.ndarray`` and a
@@ -2181,7 +2468,11 @@ def ope(
     """
 
     y_true, y_pred = _get_values_or_raise(
-        actual_series, pred_series, intersect, remove_nan_union=True
+        actual_series,
+        pred_series,
+        intersect,
+        remove_nan_union=True,
+        q=q,
     )
     y_true_sum, y_pred_sum = (
         np.nansum(y_true, axis=TIME_AX),
@@ -2204,6 +2495,7 @@ def arre(
     pred_series: Union[TimeSeries, Sequence[TimeSeries]],
     intersect: bool = True,
     *,
+    q: Optional[Union[float, List[float], Tuple[np.ndarray, pd.Index]]] = None,
     time_reduction: Optional[Callable[..., np.ndarray]] = None,
     component_reduction: Optional[Callable[[np.ndarray], float]] = np.nanmean,
     series_reduction: Optional[Callable[[np.ndarray], Union[float, np.ndarray]]] = None,
@@ -2217,8 +2509,9 @@ def arre(
 
     .. math:: 100 \\cdot \\left| \\frac{y_t - \\hat{y}_t} {\\max_t{y_t} - \\min_t{y_t}} \\right|
 
-    If any of the series is stochastic (containing several samples), :math:`\\hat{y}_t` is the median over all samples
-    for time step :math:`t`.
+    If :math:`\\hat{y}_t` are stochastic (contains several samples) or quantile predictions, use parameter `q` to
+    specify on which quantile(s) to compute the metric on. By default, it uses the median 0.5 quantile
+    (over all samples, or, if given, the quantile prediction itself).
 
     Parameters
     ----------
@@ -2229,6 +2522,8 @@ def arre(
     intersect
         For time series that are overlapping in time without having the same time index, setting `True`
         will consider the values only over their common time interval (intersection in time).
+    q
+        Optionally, the quantile (float [0, 1]) or list of quantiles of interest to compute the metric on.
     time_reduction
         Optionally, a function to aggregate the metrics over the time axis. It must reduce a `np.ndarray`
         of shape `(t, c)` to a `np.ndarray` of shape `(c,)`. The function takes as input a ``np.ndarray`` and a
@@ -2281,7 +2576,11 @@ def arre(
     """
 
     y_true, y_pred = _get_values_or_raise(
-        actual_series, pred_series, intersect, remove_nan_union=True
+        actual_series,
+        pred_series,
+        intersect,
+        remove_nan_union=True,
+        q=q,
     )
     y_max, y_min = np.nanmax(y_true, axis=TIME_AX), np.nanmin(y_true, axis=TIME_AX)
     if not (y_max > y_min).all():
@@ -2303,6 +2602,7 @@ def marre(
     pred_series: Union[TimeSeries, Sequence[TimeSeries]],
     intersect: bool = True,
     *,
+    q: Optional[Union[float, List[float], Tuple[np.ndarray, pd.Index]]] = None,
     component_reduction: Optional[Callable[[np.ndarray], float]] = np.nanmean,
     series_reduction: Optional[Callable[[np.ndarray], Union[float, np.ndarray]]] = None,
     n_jobs: int = 1,
@@ -2316,8 +2616,9 @@ def marre(
     .. math:: 100 \\cdot \\frac{1}{T} \\sum_{t=1}^{T} {\\left| \\frac{y_t - \\hat{y}_t} {\\max_t{y_t} -
               \\min_t{y_t}} \\right|}
 
-    If any of the series is stochastic (containing several samples), :math:`\\hat{y}_t` is the median over all samples
-    for time step :math:`t`.
+    If :math:`\\hat{y}_t` are stochastic (contains several samples) or quantile predictions, use parameter `q` to
+    specify on which quantile(s) to compute the metric on. By default, it uses the median 0.5 quantile
+    (over all samples, or, if given, the quantile prediction itself).
 
     Parameters
     ----------
@@ -2328,6 +2629,8 @@ def marre(
     intersect
         For time series that are overlapping in time without having the same time index, setting `True`
         will consider the values only over their common time interval (intersection in time).
+    q
+        Optionally, the quantile (float [0, 1]) or list of quantiles of interest to compute the metric on.
     component_reduction
         Optionally, a function to aggregate the metrics over the component/column axis. It must reduce a `np.ndarray`
         of shape `(t, c)` to a `np.ndarray` of shape `(t,)`. The function takes as input a ``np.ndarray`` and a
@@ -2372,6 +2675,7 @@ def marre(
             actual_series,
             pred_series,
             intersect,
+            q=q,
         ),
         axis=TIME_AX,
     )
@@ -2384,6 +2688,7 @@ def r2_score(
     pred_series: Union[TimeSeries, Sequence[TimeSeries]],
     intersect: bool = True,
     *,
+    q: Optional[Union[float, List[float], Tuple[np.ndarray, pd.Index]]] = None,
     component_reduction: Optional[Callable[[np.ndarray], float]] = np.nanmean,
     series_reduction: Optional[Callable[[np.ndarray], Union[float, np.ndarray]]] = None,
     n_jobs: int = 1,
@@ -2400,8 +2705,9 @@ def r2_score(
 
     This metric is not symmetric.
 
-    If any of the series is stochastic (containing several samples), :math:`\\hat{y}_t` is the median over all samples
-    for time step :math:`t`.
+    If :math:`\\hat{y}_t` are stochastic (contains several samples) or quantile predictions, use parameter `q` to
+    specify on which quantile(s) to compute the metric on. By default, it uses the median 0.5 quantile
+    (over all samples, or, if given, the quantile prediction itself).
 
     Parameters
     ----------
@@ -2412,6 +2718,8 @@ def r2_score(
     intersect
         For time series that are overlapping in time without having the same time index, setting `True`
         will consider the values only over their common time interval (intersection in time).
+    q
+        Optionally, the quantile (float [0, 1]) or list of quantiles of interest to compute the metric on.
     component_reduction
         Optionally, a function to aggregate the metrics over the component/column axis. It must reduce a `np.ndarray`
         of shape `(t, c)` to a `np.ndarray` of shape `(t,)`. The function takes as input a ``np.ndarray`` and a
@@ -2453,7 +2761,11 @@ def r2_score(
     .. [1] https://en.wikipedia.org/wiki/Coefficient_of_determination
     """
     y_true, y_pred = _get_values_or_raise(
-        actual_series, pred_series, intersect, remove_nan_union=True
+        actual_series,
+        pred_series,
+        intersect,
+        remove_nan_union=True,
+        q=q,
     )
     ss_errors = np.nansum((y_true - y_pred) ** 2, axis=TIME_AX)
     y_hat = np.nanmean(y_true, axis=TIME_AX)
@@ -2468,6 +2780,7 @@ def coefficient_of_variation(
     pred_series: Union[TimeSeries, Sequence[TimeSeries]],
     intersect: bool = True,
     *,
+    q: Optional[Union[float, List[float], Tuple[np.ndarray, pd.Index]]] = None,
     component_reduction: Optional[Callable[[np.ndarray], float]] = np.nanmean,
     series_reduction: Optional[Callable[[np.ndarray], Union[float, np.ndarray]]] = None,
     n_jobs: int = 1,
@@ -2483,8 +2796,9 @@ def coefficient_of_variation(
     where :math:`RMSE` is the Root Mean Squared Error (:func:`~darts.metrics.metrics.rmse`), and :math:`\\bar{y}` is
     the average of :math:`y` over all time steps.
 
-    If any of the series is stochastic (containing several samples), :math:`\\hat{y}_t` is the median over all samples
-    for time step :math:`t`.
+    If :math:`\\hat{y}_t` are stochastic (contains several samples) or quantile predictions, use parameter `q` to
+    specify on which quantile(s) to compute the metric on. By default, it uses the median 0.5 quantile
+    (over all samples, or, if given, the quantile prediction itself).
 
     Parameters
     ----------
@@ -2495,6 +2809,8 @@ def coefficient_of_variation(
     intersect
         For time series that are overlapping in time without having the same time index, setting `True`
         will consider the values only over their common time interval (intersection in time).
+    q
+        Optionally, the quantile (float [0, 1]) or list of quantiles of interest to compute the metric on.
     component_reduction
         Optionally, a function to aggregate the metrics over the component/column axis. It must reduce a `np.ndarray`
         of shape `(t, c)` to a `np.ndarray` of shape `(t,)`. The function takes as input a ``np.ndarray`` and a
@@ -2533,7 +2849,11 @@ def coefficient_of_variation(
     """
 
     y_true, y_pred = _get_values_or_raise(
-        actual_series, pred_series, intersect, remove_nan_union=True
+        actual_series,
+        pred_series,
+        intersect,
+        remove_nan_union=True,
+        q=q,
     )
     # not calling rmse as y_true and y_pred are np.ndarray
     return (
@@ -2629,9 +2949,9 @@ def dtw_metric(
 def qr(
     actual_series: Union[TimeSeries, Sequence[TimeSeries]],
     pred_series: Union[TimeSeries, Sequence[TimeSeries]],
-    q: float = 0.5,
     intersect: bool = True,
     *,
+    q: Union[float, List[float], Tuple[np.ndarray, pd.Index]] = 0.5,
     component_reduction: Optional[Callable[[np.ndarray], float]] = np.nanmean,
     series_reduction: Optional[Callable[[np.ndarray], Union[float, np.ndarray]]] = None,
     n_jobs: int = 1,
@@ -2660,11 +2980,11 @@ def qr(
         The (sequence of) actual series.
     pred_series
         The (sequence of) predicted series.
-    q
-        The quantile (float [0, 1]) of interest for the risk evaluation.
     intersect
         For time series that are overlapping in time without having the same time index, setting `True`
         will consider the values only over their common time interval (intersection in time).
+    q
+        The quantile (float [0, 1]) or list of quantiles of interest to compute the metric on.
     component_reduction
         Optionally, a function to aggregate the metrics over the component/column axis. It must reduce a `np.ndarray`
         of shape `(t, c)` to a `np.ndarray` of shape `(t,)`. The function takes as input a ``np.ndarray`` and a
@@ -2713,18 +3033,19 @@ def qr(
         actual_series,
         pred_series,
         intersect,
-        stochastic_quantile=None,
+        q=None,
         remove_nan_union=True,
     )
     z_true = np.nansum(z_true, axis=TIME_AX)
     z_hat = np.nansum(
         z_hat, axis=TIME_AX
     )  # aggregate all individual sample realizations
+    # quantile loss
+    q, _ = q
     z_hat_rho = np.quantile(
         z_hat, q=q, axis=1
-    )  # get the quantile from aggregated samples
+    ).T  # get the quantile from aggregated samples
 
-    # quantile loss
     errors = z_true - z_hat_rho
     losses = 2 * np.maximum((q - 1) * errors, q * errors)
     return losses / z_true
@@ -2735,9 +3056,9 @@ def qr(
 def ql(
     actual_series: Union[TimeSeries, Sequence[TimeSeries]],
     pred_series: Union[TimeSeries, Sequence[TimeSeries]],
-    q: float = 0.5,
     intersect: bool = True,
     *,
+    q: Union[float, List[float], Tuple[np.ndarray, pd.Index]] = 0.5,
     time_reduction: Optional[Callable[..., np.ndarray]] = None,
     component_reduction: Optional[Callable[[np.ndarray], float]] = np.nanmean,
     series_reduction: Optional[Callable[[np.ndarray], Union[float, np.ndarray]]] = None,
@@ -2747,7 +3068,8 @@ def ql(
     """Quantile Loss (QL).
 
     Also known as Pinball Loss. QL is a metric that quantifies the accuracy of a specific quantile :math:`q` from the
-    predicted value distribution of a stochastic/probabilistic `pred_series` containing N samples.
+    predicted deterministic quantiles or value distribution of a stochastic/probabilistic `pred_series` containing N
+    samples.
 
     QL computes the quantile of all sample values and the loss per time step.
 
@@ -2756,7 +3078,7 @@ def ql(
 
     .. math:: 2 \\max((q - 1) (y_t - \\hat{y}_{t,q}), q (y_t - \\hat{y}_{t,q})),
 
-    where :math:`\\hat{y}_{t,q}` is the quantile :math:`q` of all predicted sample values at time :math:`t`.
+    where :math:`\\hat{y}_{t,q}` is quantile value :math:`q` (of all predicted quantiles or samples) at time :math:`t`.
     The factor `2` makes the loss more interpretable, as for `q=0.5` the loss is identical to the Absolute Error
     (:func:`~darts.metrics.metrics.ae`).
 
@@ -2766,11 +3088,11 @@ def ql(
         The (sequence of) actual series.
     pred_series
         The (sequence of) predicted series.
-    q
-        The quantile (float [0, 1]) of interest for the loss.
     intersect
         For time series that are overlapping in time without having the same time index, setting `True`
         will consider the values only over their common time interval (intersection in time).
+    q
+        The quantile (float [0, 1]) or list of quantiles of interest to compute the metric on.
     component_reduction
         Optionally, a function to aggregate the metrics over the component/column axis. It must reduce a `np.ndarray`
         of shape `(t, c)` to a `np.ndarray` of shape `(t,)`. The function takes as input a ``np.ndarray`` and a
@@ -2816,22 +3138,14 @@ def ql(
     List[np.ndarray]
         Same as for type `np.ndarray` but for a sequence of series.
     """
-    if not pred_series.is_stochastic:
-        raise_log(
-            ValueError(
-                "quantile/pinball loss (ql) should only be computed for "
-                "stochastic predicted TimeSeries."
-            ),
-            logger=logger,
-        )
-
     y_true, y_pred = _get_values_or_raise(
         actual_series,
         pred_series,
         intersect,
-        stochastic_quantile=q,
+        q=q,
         remove_nan_union=True,
     )
+    q, _ = q
     errors = y_true - y_pred
     losses = 2.0 * np.maximum((q - 1) * errors, q * errors)
     return losses
@@ -2842,9 +3156,9 @@ def ql(
 def mql(
     actual_series: Union[TimeSeries, Sequence[TimeSeries]],
     pred_series: Union[TimeSeries, Sequence[TimeSeries]],
-    q: float = 0.5,
     intersect: bool = True,
     *,
+    q: Union[float, List[float], Tuple[np.ndarray, pd.Index]] = 0.5,
     component_reduction: Optional[Callable[[np.ndarray], float]] = np.nanmean,
     series_reduction: Optional[Callable[[np.ndarray], Union[float, np.ndarray]]] = None,
     n_jobs: int = 1,
@@ -2853,7 +3167,8 @@ def mql(
     """Mean Quantile Loss (MQL).
 
     Also known as Pinball Loss. QL is a metric that quantifies the accuracy of a specific quantile :math:`q` from the
-    predicted value distribution of a stochastic/probabilistic `pred_series` containing N samples.
+    predicted deterministic quantiles or value distribution of a stochastic/probabilistic `pred_series` containing N
+    samples.
 
     MQL first computes the quantile of all sample values and the loss per time step, and then takes the mean over the
     time axis.
@@ -2863,7 +3178,7 @@ def mql(
 
     .. math:: 2 \\frac{1}{T}\\sum_{t=1}^T{\\max((q - 1) (y_t - \\hat{y}_{t,q}), q (y_t - \\hat{y}_{t,q}))},
 
-    where :math:`\\hat{y}_{t,q}` is the quantile :math:`q` of all predicted sample values at time :math:`t`.
+    where :math:`\\hat{y}_{t,q}` is quantile value :math:`q` (of all predicted quantiles or samples) at time :math:`t`.
     The factor `2` makes the loss more interpretable, as for `q=0.5` the loss is identical to the Mean Absolute Error
     (:func:`~darts.metrics.metrics.mae`).
 
@@ -2873,11 +3188,11 @@ def mql(
         The (sequence of) actual series.
     pred_series
         The (sequence of) predicted series.
-    q
-        The quantile (float [0, 1]) of interest for the loss.
     intersect
         For time series that are overlapping in time without having the same time index, setting `True`
         will consider the values only over their common time interval (intersection in time).
+    q
+        The quantile (float [0, 1]) or list of quantiles of interest to compute the metric on.
     component_reduction
         Optionally, a function to aggregate the metrics over the component/column axis. It must reduce a `np.ndarray`
         of shape `(t, c)` to a `np.ndarray` of shape `(t,)`. The function takes as input a ``np.ndarray`` and a
@@ -2923,3 +3238,192 @@ def mql(
         ),
         axis=TIME_AX,
     )
+
+
+@interval_support
+@multi_ts_support
+@multivariate_support
+def iw(
+    actual_series: Union[TimeSeries, Sequence[TimeSeries]],
+    pred_series: Union[TimeSeries, Sequence[TimeSeries]],
+    intersect: bool = True,
+    *,
+    q_interval: Union[Tuple[float, float], Sequence[Tuple[float, float]]] = None,
+    q: Optional[Union[float, List[float], Tuple[np.ndarray, pd.Index]]] = None,
+    time_reduction: Optional[Callable[..., np.ndarray]] = None,
+    component_reduction: Optional[Callable[[np.ndarray], float]] = np.nanmean,
+    series_reduction: Optional[Callable[[np.ndarray], Union[float, np.ndarray]]] = None,
+    n_jobs: int = 1,
+    verbose: bool = False,
+) -> METRIC_OUTPUT_TYPE:
+    """Interval Width (IL).
+
+    IL gives the width of predicted quantile intervals.
+
+    For the true series :math:`y` and predicted stochastic or quantile series :math:`\\hat{y}` of length :math:`T`,
+    it is computed per component/column, quantile interval, and time step
+    :math:`t` as:
+
+    .. math:: \\hat{y}_{t,qh} - \\hat{y}_{t,ql}
+
+    where :math:`\\hat{y}_{t,qh}` are the upper bound quantile values (of all predicted quantiles or samples) at time
+    :math:`t`, and :math:`\\hat{y}_{t,ql}` are the lower bound quantile values.
+
+    Parameters
+    ----------
+    actual_series
+        The (sequence of) actual series.
+    pred_series
+        The (sequence of) predicted series.
+    intersect
+        For time series that are overlapping in time without having the same time index, setting `True`
+        will consider the values only over their common time interval (intersection in time).
+    q_interval
+        The quantile interval(s) to compute the metric on. Must be a tuple (single interval) or sequence tuples
+        (multiple intervals) with elements (low quantile, high quantile).
+    q
+        Quantiles `q` not supported by this metric; use `q_interval` instead.
+    component_reduction
+        Optionally, a function to aggregate the metrics over the component/column axis. It must reduce a `np.ndarray`
+        of shape `(t, c)` to a `np.ndarray` of shape `(t,)`. The function takes as input a ``np.ndarray`` and a
+        parameter named `axis`, and returns the reduced array. The `axis` receives value `1` corresponding to the
+        component axis. If `None`, will return a metric per component.
+    time_reduction
+        Optionally, a function to aggregate the metrics over the time axis. It must reduce a `np.ndarray`
+        of shape `(t, c)` to a `np.ndarray` of shape `(c,)`. The function takes as input a ``np.ndarray`` and a
+        parameter named `axis`, and returns the reduced array. The `axis` receives value `0` corresponding to the
+        time axis. If `None`, will return a metric per time step.
+    series_reduction
+        Optionally, a function to aggregate the metrics over multiple series. It must reduce a `np.ndarray`
+        of shape `(s, t, c)` to a `np.ndarray` of shape `(t, c)` The function takes as input a ``np.ndarray`` and a
+        parameter named `axis`, and returns the reduced array. The `axis` receives value `0` corresponding to the
+        series axis. For example with `np.nanmean`, will return the average over all series metrics. If `None`, will
+        return a metric per component.
+    n_jobs
+        The number of jobs to run in parallel. Parallel jobs are created only when a ``Sequence[TimeSeries]`` is
+        passed as input, parallelising operations regarding different ``TimeSeries``. Defaults to `1`
+        (sequential). Setting the parameter to `-1` means using all the available processors.
+    verbose
+        Optionally, whether to print operations progress
+
+    Returns
+    -------
+    float
+        A single metric score for:
+
+        - single univariate series.
+        - single multivariate series with `component_reduction`.
+        - a sequence (list) of uni/multivariate series with `series_reduction`, `component_reduction` and
+          `time_reduction`.
+    np.ndarray
+        A numpy array of metric scores. The array has shape (n time steps, n components) without time
+        and component reductions. For:
+
+        - single multivariate series and at least `component_reduction=None`.
+        - single uni/multivariate series and at least `time_reduction=None`.
+        - a sequence of uni/multivariate series including `series_reduction` and at least one of
+          `component_reduction=None` or `time_reduction=None`.
+    List[float]
+        Same as for type `float` but for a sequence of series.
+    List[np.ndarray]
+        Same as for type `np.ndarray` but for a sequence of series.
+    """
+    y_true, y_pred = _get_values_or_raise(
+        actual_series,
+        pred_series,
+        intersect,
+        remove_nan_union=True,
+        q=q,
+    )
+    y_pred_lo, y_pred_hi = _get_quantile_intervals(y_pred, q=q, q_interval=q_interval)
+    return y_pred_hi - y_pred_lo
+
+
+@interval_support
+@multi_ts_support
+@multivariate_support
+def miw(
+    actual_series: Union[TimeSeries, Sequence[TimeSeries]],
+    pred_series: Union[TimeSeries, Sequence[TimeSeries]],
+    intersect: bool = True,
+    *,
+    q_interval: Union[Tuple[float, float], Sequence[Tuple[float, float]]] = None,
+    q: Optional[Union[float, List[float], Tuple[np.ndarray, pd.Index]]] = None,
+    component_reduction: Optional[Callable[[np.ndarray], float]] = np.nanmean,
+    series_reduction: Optional[Callable[[np.ndarray], Union[float, np.ndarray]]] = None,
+    n_jobs: int = 1,
+    verbose: bool = False,
+) -> METRIC_OUTPUT_TYPE:
+    """Mean Interval Width (IL).
+
+    IL gives the width of predicted quantile intervals aggregated over time.
+
+    For the true series :math:`y` and predicted stochastic or quantile series :math:`\\hat{y}` of length :math:`T`,
+    it is computed per component/column, quantile interval, and time step
+    :math:`t` as:
+
+    .. math:: \\frac{1}{T}\\sum_{t=1}^T{\\hat{y}_{t,qh} - \\hat{y}_{t,ql}}
+
+    where :math:`\\hat{y}_{t,qh}` are the upper bound quantile values (of all predicted quantiles or samples) at time
+    :math:`t`, and :math:`\\hat{y}_{t,ql}` are the lower bound quantile values.
+
+    Parameters
+    ----------
+    actual_series
+        The (sequence of) actual series.
+    pred_series
+        The (sequence of) predicted series.
+    intersect
+        For time series that are overlapping in time without having the same time index, setting `True`
+        will consider the values only over their common time interval (intersection in time).
+    q_interval
+        The quantile interval(s) to compute the metric on. Must be a tuple (single interval) or sequence tuples
+        (multiple intervals) with elements (low quantile, high quantile).
+    q
+        Quantiles `q` not supported by this metric; use `q_interval` instead.
+    component_reduction
+        Optionally, a function to aggregate the metrics over the component/column axis. It must reduce a `np.ndarray`
+        of shape `(t, c)` to a `np.ndarray` of shape `(t,)`. The function takes as input a ``np.ndarray`` and a
+        parameter named `axis`, and returns the reduced array. The `axis` receives value `1` corresponding to the
+        component axis. If `None`, will return a metric per component.
+    series_reduction
+        Optionally, a function to aggregate the metrics over multiple series. It must reduce a `np.ndarray`
+        of shape `(s, t, c)` to a `np.ndarray` of shape `(t, c)` The function takes as input a ``np.ndarray`` and a
+        parameter named `axis`, and returns the reduced array. The `axis` receives value `0` corresponding to the
+        series axis. For example with `np.nanmean`, will return the average over all series metrics. If `None`, will
+        return a metric per component.
+    n_jobs
+        The number of jobs to run in parallel. Parallel jobs are created only when a ``Sequence[TimeSeries]`` is
+        passed as input, parallelising operations regarding different ``TimeSeries``. Defaults to `1`
+        (sequential). Setting the parameter to `-1` means using all the available processors.
+    verbose
+        Optionally, whether to print operations progress
+
+    Returns
+    -------
+    float
+        A single metric score for:
+
+        - single univariate series.
+        - single multivariate series with `component_reduction`.
+        - sequence (list) of uni/multivariate series with `series_reduction` and `component_reduction`.
+    np.ndarray
+        A numpy array of metric scores. The array has shape (n components,) without component reduction. For:
+
+        - single multivariate series and at least `component_reduction=None`.
+        - sequence of uni/multivariate series including `series_reduction` and `component_reduction=None`.
+    List[float]
+        Same as for type `float` but for a sequence of series.
+    List[np.ndarray]
+        Same as for type `np.ndarray` but for a sequence of series.
+    """
+    return np.nanmean(
+        _get_wrapped_metric(iw, n_wrappers=3)(
+            actual_series,
+            pred_series,
+            intersect,
+            q=q,
+            q_interval=q_interval,
+        ),
+        axis=TIME_AX,
+    )
diff --git a/darts/models/forecasting/forecasting_model.py b/darts/models/forecasting/forecasting_model.py
index 5a5dc7a738..ee271ceed6 100644
--- a/darts/models/forecasting/forecasting_model.py
+++ b/darts/models/forecasting/forecasting_model.py
@@ -57,7 +57,12 @@
     get_single_series,
     series2seq,
 )
-from darts.utils.utils import generate_index
+from darts.utils.utils import (
+    generate_index,
+    likelihood_component_names,
+    quantile_interval_names,
+    quantile_names,
+)
 
 logger = get_logger(__name__)
 
@@ -754,6 +759,7 @@ def historical_forecasts(
             Default: ``False``
         enable_optimization
             Whether to use the optimized version of historical_forecasts when supported and available.
+            Default: ``True``.
         fit_kwargs
             Additional arguments passed to the model `fit()` method.
         predict_kwargs
@@ -1193,6 +1199,8 @@ def backtest(
         reduction: Union[Callable[..., float], None] = np.mean,
         verbose: bool = False,
         show_warnings: bool = True,
+        predict_likelihood_parameters: bool = False,
+        enable_optimization: bool = True,
         metric_kwargs: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
         fit_kwargs: Optional[Dict[str, Any]] = None,
         predict_kwargs: Optional[Dict[str, Any]] = None,
@@ -1312,6 +1320,13 @@ def backtest(
             Whether to print progress.
         show_warnings
             Whether to show warnings related to parameters `start`, and `train_length`.
+        predict_likelihood_parameters
+            If set to `True`, the model predict the parameters of its Likelihood parameters instead of the target. Only
+            supported for probabilistic models with `likelihood="quantile"`, `num_samples = 1` and
+            `n<=output_chunk_length`. Default: ``False``.
+        enable_optimization
+            Whether to use the optimized version of historical_forecasts when supported and available.
+            Default: ``True``.
         metric_kwargs
             Additional arguments passed to `metric()`, such as `'n_jobs'` for parallelization, `'component_reduction'`
             for reducing the component wise metrics, seasonality `'m'` for scaled metrics, etc. Will pass arguments to
@@ -1343,9 +1358,9 @@ def backtest(
             An numpy array of backtest scores. For single series and one of:
 
             - a single `metric` function, `historical_forecasts` generated with `last_points_only=False`
-              and backtest `reduction=None`. The output has shape (n forecasts,).
+              and backtest `reduction=None`. The output has shape (n forecasts, *).
             - multiple `metric` functions and `historical_forecasts` generated with `last_points_only=False`.
-              The output has shape (n metrics,) when using a backtest `reduction`, and (n metrics, n forecasts)
+              The output has shape (*, n metrics) when using a backtest `reduction`, and (n forecasts, *, n metrics)
               when `reduction=None`
             - multiple uni/multivariate series including `series_reduction` and at least one of
               `component_reduction=None` or `time_reduction=None` for "per time step metrics"
@@ -1391,6 +1406,8 @@ def backtest(
             last_points_only=last_points_only,
             verbose=verbose,
             show_warnings=show_warnings,
+            predict_likelihood_parameters=predict_likelihood_parameters,
+            enable_optimization=enable_optimization,
             fit_kwargs=fit_kwargs,
             predict_kwargs=predict_kwargs,
             sample_weight=sample_weight,
@@ -1491,22 +1508,24 @@ def __getitem__(self, index) -> TimeSeries:
         # get errors for each input `series`
         backtest_list = []
         for i in range(len(cum_len) - 1):
-            # errors_series with shape `(n metrics, n series specific historical forecasts)`
+            # errors_series with shape `(n metrics, n series specific historical forecasts, *)`
             errors_series = errors[:, cum_len[i] : cum_len[i + 1]]
 
             if reduction is not None:
-                # shape `(n metrics, n forecasts)` -> `(n metrics,)`
+                # shape `(n metrics, n forecasts, *)` -> `(n metrics, *)`
                 errors_series = reduction(errors_series, axis=1)
             elif last_points_only:
-                # shape `(n metrics, n forecasts = 1)` -> `(n metrics,)`
+                # shape `(n metrics, n forecasts = 1, *)` -> `(n metrics, *)`
                 errors_series = errors_series[:, 0]
 
             if len(metric) == 1:
                 # shape `(n metrics, *)` -> `(*,)`
                 errors_series = errors_series[0]
-            elif not last_points_only and reduction is None:
+            else:
                 # shape `(n metrics, *)` -> `(*, n metrics)`
-                errors_series = errors_series.T
+                errors_series = errors_series.transpose(
+                    tuple(i for i in range(1, errors_series.ndim)) + (0,)
+                )
 
             backtest_list.append(errors_series)
         return (
@@ -1831,6 +1850,8 @@ def residuals(
         metric: METRIC_TYPE = metrics.err,
         verbose: bool = False,
         show_warnings: bool = True,
+        predict_likelihood_parameters: bool = False,
+        enable_optimization: bool = True,
         metric_kwargs: Optional[Dict[str, Any]] = None,
         fit_kwargs: Optional[Dict[str, Any]] = None,
         predict_kwargs: Optional[Dict[str, Any]] = None,
@@ -1949,6 +1970,13 @@ def residuals(
             Whether to print progress.
         show_warnings
             Whether to show warnings related to parameters `start`, and `train_length`.
+        predict_likelihood_parameters
+            If set to `True`, the model predict the parameters of its Likelihood parameters instead of the target. Only
+            supported for probabilistic models with `likelihood="quantile"`, `num_samples = 1` and
+            `n<=output_chunk_length`. Default: ``False``.
+        enable_optimization
+            Whether to use the optimized version of historical_forecasts when supported and available.
+            Default: ``True``.
         metric_kwargs
             Additional arguments passed to `metric()`, such as `'n_jobs'` for parallelization, `'m'` for scaled
             metrics, etc. Will pass arguments only if they are present in the corresponding metric signature. Ignores
@@ -2004,6 +2032,8 @@ def residuals(
             last_points_only=last_points_only,
             verbose=verbose,
             show_warnings=show_warnings,
+            predict_likelihood_parameters=predict_likelihood_parameters,
+            enable_optimization=enable_optimization,
             fit_kwargs=fit_kwargs,
             predict_kwargs=predict_kwargs,
             overlap_end=False,
@@ -2035,9 +2065,10 @@ def residuals(
             residuals = [[res] for res in residuals]
 
         # sanity check residual output
+        q, q_interval = metric_kwargs.get("q"), metric_kwargs.get("q_interval")
         try:
-            res, fc = residuals[0][0], historical_forecasts[0][0]
-            _ = np.reshape(res, (len(fc), fc.n_components, 1))
+            series_, res, fc = series[0], residuals[0][0], historical_forecasts[0][0]
+            _ = np.reshape(res, (len(fc), -1, 1))
         except Exception as err:
             raise_log(
                 ValueError(
@@ -2051,13 +2082,47 @@ def residuals(
 
         # process residuals
         residuals_out = []
-        for fc_list, res_list in zip(historical_forecasts, residuals):
+        for series_, fc_list, res_list in zip(series, historical_forecasts, residuals):
             res_list_out = []
+            if q is not None:
+                q = [q] if isinstance(q, float) else q
+                # multi-quantile metrics yield more components
+                comp_names = likelihood_component_names(
+                    components=series_.components,
+                    parameter_names=quantile_names(q),
+                )
+            # `q` and `q_interval` are mutually exclusive
+            elif q_interval is not None:
+                # multi-quantile metrics yield more components
+                q_interval = (
+                    [q_interval] if isinstance(q_interval, tuple) else q_interval
+                )
+                comp_names = likelihood_component_names(
+                    components=series_.components,
+                    parameter_names=quantile_interval_names(q_interval),
+                )
+            else:
+                comp_names = None
             for fc, res in zip(fc_list, res_list):
-                # make sure all residuals have shape (n time steps, n components, n samples=1)
+                # make sure all residuals have shape (n time steps, n components * n quantiles, n samples=1)
                 if len(res.shape) != 3:
-                    res = np.reshape(res, (len(fc), fc.n_components, 1))
-                res_list_out.append(res if values_only else fc.with_values(res))
+                    res = np.reshape(res, (len(fc), -1, 1))
+                if values_only:
+                    res = res
+                elif (q is None and q_interval is None) and res.shape[
+                    1
+                ] == fc.n_components:
+                    res = fc.with_values(res)
+                else:
+                    # quantile (interval) metrics created different number of components;
+                    # create new series with unknown components
+                    res = TimeSeries.from_times_and_values(
+                        times=fc._time_index,
+                        values=res,
+                        columns=comp_names,
+                    )
+                res_list_out.append(res)
+
             residuals_out.append(res_list_out)
 
         # if required, reduce to `series` input type
diff --git a/darts/models/forecasting/regression_model.py b/darts/models/forecasting/regression_model.py
index d22a23329e..4ca5b68ffe 100644
--- a/darts/models/forecasting/regression_model.py
+++ b/darts/models/forecasting/regression_model.py
@@ -56,7 +56,11 @@
 )
 from darts.utils.multioutput import MultiOutputRegressor
 from darts.utils.ts_utils import get_single_series, seq2series, series2seq
-from darts.utils.utils import _check_quantiles
+from darts.utils.utils import (
+    _check_quantiles,
+    likelihood_component_names,
+    quantile_names,
+)
 
 logger = get_logger(__name__)
 
@@ -1668,17 +1672,15 @@ def _quantiles_generate_components_names(
     ) -> List[str]:
         return self._likelihood_generate_components_names(
             input_series,
-            [f"q{quantile:.2f}" for quantile in self._model_container.keys()],
+            quantile_names(q=self._model_container.keys()),
         )
 
     def _likelihood_generate_components_names(
         self, input_series: TimeSeries, parameter_names: List[str]
     ) -> List[str]:
-        return [
-            f"{tgt_name}_{param_n}"
-            for tgt_name in input_series.components
-            for param_n in parameter_names
-        ]
+        return likelihood_component_names(
+            components=input_series.components, parameter_names=parameter_names
+        )
 
 
 class _QuantileModelContainer(OrderedDict):
diff --git a/darts/tests/metrics/test_metrics.py b/darts/tests/metrics/test_metrics.py
index 40754c0bfe..df7a820bf5 100644
--- a/darts/tests/metrics/test_metrics.py
+++ b/darts/tests/metrics/test_metrics.py
@@ -7,8 +7,9 @@
 import pytest
 import sklearn.metrics
 
-from darts import TimeSeries
+from darts import TimeSeries, concatenate
 from darts.metrics import metrics
+from darts.utils.utils import likelihood_component_names, quantile_names
 
 
 def sklearn_mape(*args, **kwargs):
@@ -65,6 +66,19 @@ def metric_rmsle(y_true, y_pred, **kwargs):
     )
 
 
+def metric_iw(y_true, y_pred, q_interval=None, **kwargs):
+    # this tests assumes `y_pred` are stochastic values
+    if isinstance(q_interval, tuple):
+        q_interval = [q_interval]
+    q_interval = np.array(q_interval)
+    q_lo = q_interval[:, 0]
+    q_hi = q_interval[:, 1]
+    y_pred_lo = np.quantile(y_pred, q_lo, axis=2).transpose(1, 2, 0)
+    y_pred_hi = np.quantile(y_pred, q_hi, axis=2).transpose(1, 2, 0)
+    res = y_pred_hi - y_pred_lo
+    return res.reshape(len(y_pred), -1)
+
+
 class TestMetrics:
     np.random.seed(42)
     pd_train = pd.Series(
@@ -963,6 +977,15 @@ def test_arre(self, config):
         self.helper_test_nan(metric, **kwargs)
         self.helper_test_non_aggregate(metric, is_aggregate)
 
+        with pytest.raises(ValueError) as exc:
+            _ = metric(
+                TimeSeries.from_values(np.ones((3, 1, 1))),
+                TimeSeries.from_values(np.ones((3, 1, 1))),
+            )
+        assert str(exc.value).startswith(
+            "The difference between the max and min values must "
+        )
+
     @pytest.mark.parametrize(
         "metric",
         [
@@ -1034,10 +1057,10 @@ def test_sape(self, config):
         "config",
         [
             (metrics.ase, False, {"time_reduction": np.nanmean}),
-            # (metrics.sse, False, {"time_reduction": np.nanmean}),
-            # (metrics.mase, True, {}),
-            # (metrics.msse, True, {}),
-            # (metrics.rmsse, True, {}),
+            (metrics.sse, False, {"time_reduction": np.nanmean}),
+            (metrics.mase, True, {}),
+            (metrics.msse, True, {}),
+            (metrics.rmsse, True, {}),
         ],
     )
     def test_scaled_errors(self, config):
@@ -1110,6 +1133,9 @@ def test_scaled_errors(self, config):
         assert str(err.value).startswith(
             "The `insample` series must start before the `pred_series`"
         )
+        # wrong number of components
+        with pytest.raises(ValueError):
+            metric(self.series1, self.series2, insample.stack(insample))
         # multi-ts, second series is not a TimeSeries
         with pytest.raises(ValueError):
             metric([self.series1] * 2, self.series2, [insample] * 2)
@@ -1157,6 +1183,22 @@ def test_rho_risk(self):
         np.testing.assert_array_almost_equal(metrics.qr(s1, s12_stochastic, q=0.0), 0.0)
         np.testing.assert_array_almost_equal(metrics.qr(s2, s12_stochastic, q=1.0), 0.0)
 
+        # preds must be probabilistic
+        q_names = likelihood_component_names(
+            self.series1.components,
+            quantile_names([0.5]),
+        )
+        with pytest.raises(ValueError) as exc:
+            metrics.qr(
+                self.series1,
+                self.series1.with_columns_renamed(self.series1.components, q_names),
+                q=0.5,
+            )
+        assert (
+            str(exc.value)
+            == "quantile risk (qr) should only be computed for stochastic predicted TimeSeries."
+        )
+
     @pytest.mark.parametrize(
         "config",
         [
@@ -1241,7 +1283,7 @@ def test_metrics_arguments(self):
 
         # should fail if kwargs are passed as args, because of the "*"
         with pytest.raises(TypeError):
-            metrics.r2_score(series00, series11, False, np.mean)
+            metrics.r2_score(series00, series11, False, 0.5, np.mean)
 
     def test_multiple_ts_rmse(self):
         # simple test
@@ -1539,3 +1581,549 @@ def helper_test_non_aggregate(self, metric, is_aggregate, val_exp=None):
 
         if val_exp is not None:
             assert (res == -1.0).all()
+
+    @pytest.mark.parametrize(
+        "config",
+        list(
+            itertools.product(
+                [
+                    # time dependent but with time reduction
+                    metrics.err,
+                    metrics.ae,
+                    metrics.se,
+                    metrics.sle,
+                    metrics.ase,
+                    metrics.sse,
+                    metrics.ape,
+                    metrics.sape,
+                    metrics.arre,
+                    metrics.ql,
+                    # time aggregates
+                    metrics.merr,
+                    metrics.mae,
+                    metrics.mse,
+                    metrics.rmse,
+                    metrics.rmsle,
+                    metrics.mase,
+                    metrics.msse,
+                    metrics.rmsse,
+                    metrics.mape,
+                    metrics.smape,
+                    metrics.ope,
+                    metrics.marre,
+                    metrics.r2_score,
+                    metrics.coefficient_of_variation,
+                    metrics.mql,
+                ],
+                [True, False],  # univariate series
+                [True, False],  # single series
+            )
+        ),
+    )
+    def test_metric_quantiles(self, config):
+        """Test output types and shapes for time aggregated metrics with quantiles:
+        for single and multiple univariate or multivariate series, in combination
+        with different component and series reduction functions."""
+        np.random.seed(42)
+        metric, is_univar, is_single = config
+        params = inspect.signature(metric).parameters
+
+        n_comp = 1 if is_univar else 2
+
+        qs_all = [0.1, 0.5, 0.8]
+        components = [str(i) for i in range(n_comp)]
+
+        series_vals = np.random.random((10, n_comp, 1))
+
+        pred_prob_vals = np.random.random((10, n_comp, 100))
+
+        pred_vals_qs = []
+        for i in range(n_comp):
+            pred_vals_qs.append(
+                np.quantile(pred_prob_vals[:, [i]], qs_all, axis=2).transpose(1, 0, 2)
+            )
+        pred_vals_qs = np.concatenate(pred_vals_qs, axis=1)
+        pred_components = likelihood_component_names(
+            components=components, parameter_names=quantile_names(q=qs_all)
+        )
+
+        series = TimeSeries.from_values(series_vals, columns=components)
+        series_q_exp = concatenate(
+            [series[comp] for comp in components for _ in qs_all], axis=1
+        )
+        pred_prob = TimeSeries.from_values(pred_prob_vals, columns=components)
+        pred_qs = TimeSeries.from_values(pred_vals_qs, columns=pred_components)
+        insample = series.shift(-len(series))
+        insample_q_exp = concatenate(
+            [insample[comp] for comp in components for _ in qs_all], axis=1
+        )
+        shape_time = (len(pred_qs),) if "time_reduction" in params else tuple()
+
+        if not is_single:
+            series = [series] * 2
+            series_q_exp = [series_q_exp] * 2
+            pred_prob = [pred_prob] * 2
+            pred_qs = [pred_qs] * 2
+            insample = [insample] * 2
+            insample_q_exp = [insample_q_exp] * 2
+
+        kwargs = {"actual_series": series}
+        if "insample" in params:
+            kwargs["insample"] = insample
+
+        def check_res(
+            pred_prob_, pred_qs_, shape_exp, series_reduction=None, **test_kwargs
+        ):
+            res_prob = metric(
+                pred_series=pred_prob_,
+                series_reduction=series_reduction,
+                **kwargs,
+                **test_kwargs,
+            )
+            res_qs = metric(
+                pred_series=pred_qs_,
+                series_reduction=series_reduction,
+                **kwargs,
+                **test_kwargs,
+            )
+            if is_single or series_reduction is not None:
+                res_prob = [res_prob]
+                res_qs = [res_qs]
+            if series_reduction is None and not is_single:
+                assert len(res_prob) == len(res_qs) == len(pred_prob_)
+
+            for res_p, res_q in zip(res_prob, res_qs):
+                assert res_p.shape == res_q.shape == shape_exp
+                np.testing.assert_array_almost_equal(res_p, res_q)
+
+        check_res(pred_prob, pred_qs, shape_time, q=0.1)
+        # one quantile as list
+        check_res(pred_prob, pred_qs, shape_time, q=[0.1])
+        # multiple quantiles
+        check_res(pred_prob, pred_qs, shape_time + (2,), q=[0.1, 0.8])
+        # all quantiles
+        check_res(pred_prob, pred_qs, shape_time + (3,), q=[0.1, 0.5, 0.8])
+        qs = [0.1, 0.8]
+        # component and series reduction
+        check_res(
+            pred_prob,
+            pred_qs,
+            shape_time + (len(qs),),
+            q=qs,
+            component_reduction=np.mean,
+            series_reduction=np.mean,
+        )
+        # no component reduction
+        check_res(
+            pred_prob,
+            pred_qs,
+            shape_time + (len(qs) * n_comp,),
+            q=qs,
+            component_reduction=None,
+            series_reduction=np.mean,
+        )
+        # no series reduction
+        check_res(
+            pred_prob,
+            pred_qs,
+            shape_time + (len(qs),),
+            q=qs,
+            component_reduction=np.mean,
+            series_reduction=None,
+        )
+        # no series and component reduction
+        check_res(
+            pred_prob,
+            pred_qs,
+            shape_time + (len(qs) * n_comp,),
+            q=qs,
+            component_reduction=None,
+            series_reduction=None,
+        )
+
+        # check that we get identical results as when computing each quantile component against the actual
+        # target component directly
+        kwargs_direct = copy.deepcopy(kwargs)
+        q_direct = {}
+        if metric.__name__ not in ["ql", "mql"]:
+            kwargs_direct["actual_series"] = series_q_exp
+            if "insample" in params:
+                kwargs_direct["insample"] = insample_q_exp
+        else:
+            q_direct["q"] = qs_all
+            kwargs_direct["actual_series"] = series
+
+        res_direct = metric(
+            pred_series=pred_qs, component_reduction=None, **kwargs_direct, **q_direct
+        )
+        res_qs = metric(
+            pred_series=pred_qs,
+            component_reduction=None,
+            q=qs_all,
+            **kwargs,
+        )
+        np.testing.assert_array_almost_equal(res_direct, res_qs)
+
+    def test_invalid_quantiles(self):
+        np.random.seed(42)
+        series_a = TimeSeries.from_values(np.random.random((10, 2, 1)))
+        series_b = TimeSeries.from_values(np.random.random((10, 2, 10)))
+
+        # unsorted quantiles
+        with pytest.raises(ValueError) as exc:
+            _ = metrics.mae(series_a, series_b, q=[0.2, 0.1])
+        assert "a sequence of increasing order" in str(exc.value)
+
+        # non-unique values metrics
+        with pytest.raises(ValueError) as exc:
+            _ = metrics.mae(series_a, series_b, q=[0.2, 0.2])
+        assert "with unique values only" in str(exc.value)
+
+        # q > 1
+        with pytest.raises(ValueError) as exc:
+            _ = metrics.mae(series_a, series_b, q=[0.2, 1.01])
+        assert "must be in the range `(>=0,<=1)`" in str(exc.value)
+
+        # q < 0
+        with pytest.raises(ValueError) as exc:
+            _ = metrics.mae(series_a, series_b, q=[-0.01, 0.2])
+        assert "must be in the range `(>=0,<=1)`" in str(exc.value)
+
+        # but sorted, unique, and valid quantiles work
+        _ = metrics.mae(series_a, series_b, q=[0.0, 0.5, 1.0])
+
+    def test_quantile_as_tuple(self):
+        """Test that `q` as tuple (list of quantiles, quantile component names) gives same results as `q`
+        as quantile values list."""
+        np.random.seed(42)
+        q = [0.25, 0.75]
+
+        series_a = TimeSeries.from_values(np.random.random((10, 2, 1)))
+        q_names = pd.Index(
+            likelihood_component_names(series_a.components, quantile_names(q))
+        )
+        series_b = TimeSeries.from_values(np.random.random((10, 4, 1)), columns=q_names)
+
+        np.testing.assert_array_almost_equal(
+            metrics.mae(series_a, series_b, q=(q, q_names)),
+            metrics.mae(series_a, series_b, q=q),
+        )
+
+    def test_custom_metric_wrong_output_shape(self):
+        """Test that custom metrics must have correct output dim."""
+
+        @metrics.multi_ts_support
+        @metrics.multivariate_support
+        def custom_metric(
+            actual_series,
+            pred_series,
+            intersect=True,
+            *,
+            q=None,
+            time_reduction=None,
+            component_reduction=np.nanmean,
+            series_reduction=None,
+            n_jobs=1,
+            verbose=False,
+            out_ndim=1,
+        ):
+            return np.ones(tuple(1 for _ in range(out_ndim)))
+
+        for ndim in [1, 4]:
+            with pytest.raises(ValueError) as exc:
+                custom_metric(self.series1, self.series2, out_ndim=ndim)
+            assert str(exc.value).startswith(
+                "Metric output must have 2 dimensions (n components, n quantiles) for aggregated metrics"
+            )
+        for ndim in [2, 3]:
+            _ = custom_metric(self.series1, self.series2, out_ndim=ndim)
+
+    def test_wrong_error_scale(self):
+        with pytest.raises(ValueError) as exc:
+            _ = metrics._get_error_scale(
+                self.series1.shift(-len(self.series1)),
+                self.series1,
+                m=1,
+                metric="wrong_metric",
+            )
+        assert str(exc.value).startswith("unknown `metric=wrong_metric`")
+
+    @pytest.mark.parametrize(
+        "config",
+        [
+            # only time dependent quantile interval metrics
+            (metrics.iw, metric_iw),
+        ],
+    )
+    def test_metric_quantile_interval_accuracy(self, config):
+        """Test output types and shapes for time dependent metrics with quantile intervals:
+        for single and multiple univariate or multivariate series, in combination
+        with different component and series reduction functions."""
+        np.random.seed(42)
+        metric, metric_ref = config
+        n_comp = 2
+        components = [str(i) for i in range(n_comp)]
+        series_vals = np.random.random((10, n_comp, 1))
+        pred_prob_vals = np.random.random((10, n_comp, 100))
+        series = TimeSeries.from_values(series_vals, columns=components)
+        pred_prob = TimeSeries.from_values(pred_prob_vals, columns=components)
+
+        def check_ref(**test_kwargs):
+            res_prob = metric(
+                actual_series=series,
+                pred_series=pred_prob,
+                series_reduction=None,
+                component_reduction=None,
+                time_reduction=None,
+                **test_kwargs,
+            )
+            res_ref = metric_ref(
+                y_true=series.all_values(),
+                y_pred=pred_prob.all_values(),
+                **test_kwargs,
+            )
+            np.testing.assert_array_almost_equal(res_prob, res_ref)
+
+        # one interval as tuple
+        check_ref(q_interval=(0.1, 0.5))
+        # one interval in list
+        check_ref(q_interval=[(0.1, 0.5)])
+        # multiple intervals
+        check_ref(q_interval=[(0.1, 0.5), (0.5, 0.8)])
+
+    @pytest.mark.parametrize(
+        "config",
+        list(
+            itertools.product(
+                [
+                    # time dependent but with time reduction
+                    metrics.iw,
+                    metrics.miw,
+                ],
+                [True, False],  # univariate series
+                [True, False],  # single series
+            )
+        ),
+    )
+    def test_metric_quantile_interval(self, config):
+        """Test output types and shapes for time aggregated metrics with quantile intervals:
+        for single and multiple univariate or multivariate series, in combination
+        with different component and series reduction functions."""
+        np.random.seed(42)
+        metric, is_univar, is_single = config
+        params = inspect.signature(metric).parameters
+
+        n_comp = 1 if is_univar else 2
+
+        qs_all = [0.1, 0.5, 0.8]
+        components = [str(i) for i in range(n_comp)]
+
+        series_vals = np.random.random((10, n_comp, 1))
+        pred_prob_vals = np.random.random((10, n_comp, 100))
+
+        pred_vals_qs = []
+        for i in range(n_comp):
+            pred_vals_qs.append(
+                np.quantile(pred_prob_vals[:, [i]], qs_all, axis=2).transpose(1, 0, 2)
+            )
+        pred_vals_qs = np.concatenate(pred_vals_qs, axis=1)
+        pred_components = likelihood_component_names(
+            components=components, parameter_names=quantile_names(q=qs_all)
+        )
+
+        series = TimeSeries.from_values(series_vals, columns=components)
+        pred_prob = TimeSeries.from_values(pred_prob_vals, columns=components)
+        pred_qs = TimeSeries.from_values(pred_vals_qs, columns=pred_components)
+        shape_time = (len(pred_qs),) if "time_reduction" in params else tuple()
+
+        if not is_single:
+            series = [series] * 2
+            pred_prob = [pred_prob] * 2
+            pred_qs = [pred_qs] * 2
+
+        kwargs = {"actual_series": series}
+
+        def check_res(
+            pred_prob_, pred_qs_, shape_exp, series_reduction=None, **test_kwargs
+        ):
+            res_prob = metric(
+                actual_series=series,
+                pred_series=pred_prob_,
+                series_reduction=series_reduction,
+                **test_kwargs,
+            )
+            res_qs = metric(
+                actual_series=series,
+                pred_series=pred_qs_,
+                series_reduction=series_reduction,
+                **test_kwargs,
+            )
+            if is_single or series_reduction is not None:
+                res_prob = [res_prob]
+                res_qs = [res_qs]
+            if series_reduction is None and not is_single:
+                assert len(res_prob) == len(res_qs) == len(pred_prob_)
+
+            for res_p, res_q in zip(res_prob, res_qs):
+                assert res_p.shape == res_q.shape == shape_exp
+                np.testing.assert_array_almost_equal(res_p, res_q)
+            return res_qs
+
+        # one interval as tuple
+        res = check_res(pred_prob, pred_qs, shape_time, q_interval=(0.1, 0.5))
+        # one interval in list
+        res2 = check_res(pred_prob, pred_qs, shape_time, q_interval=[(0.1, 0.5)])
+        np.testing.assert_array_almost_equal(res, res2)
+        # multiple intervals
+        check_res(
+            pred_prob, pred_qs, shape_time + (2,), q_interval=[(0.1, 0.5), (0.5, 0.8)]
+        )
+        # all intervals
+        check_res(
+            pred_prob,
+            pred_qs,
+            shape_time + (3,),
+            q_interval=[(0.1, 0.5), (0.5, 0.8), (0.1, 0.8)],
+        )
+        q_intervals = [(0.1, 0.5), (0.5, 0.8)]
+        # component and series reduction
+        check_res(
+            pred_prob,
+            pred_qs,
+            shape_time + (len(q_intervals),),
+            q_interval=q_intervals,
+            component_reduction=np.mean,
+            series_reduction=np.mean,
+        )
+        # no component reduction
+        check_res(
+            pred_prob,
+            pred_qs,
+            shape_time + (len(q_intervals) * n_comp,),
+            q_interval=q_intervals,
+            component_reduction=None,
+            series_reduction=np.mean,
+        )
+        # no series reduction
+        check_res(
+            pred_prob,
+            pred_qs,
+            shape_time + (len(q_intervals),),
+            q_interval=q_intervals,
+            component_reduction=np.mean,
+            series_reduction=None,
+        )
+        # no series and component reduction
+        check_res(
+            pred_prob,
+            pred_qs,
+            shape_time + (len(q_intervals) * n_comp,),
+            q_interval=q_intervals,
+            component_reduction=None,
+            series_reduction=None,
+        )
+
+        # check that we get identical results as when computing intervals separately (on the time aggregated case)
+        if "time_reduction" in params:
+            kwargs["time_reduction"] = np.mean
+        res_lo = metric(
+            pred_series=pred_qs,
+            component_reduction=None,
+            q_interval=(0.1, 0.5),
+            **kwargs,
+        )
+        res_hi = metric(
+            pred_series=pred_qs,
+            component_reduction=None,
+            q_interval=(0.5, 0.8),
+            **kwargs,
+        )
+        res_multi = metric(
+            pred_series=pred_qs,
+            component_reduction=None,
+            q_interval=[(0.1, 0.5), (0.5, 0.8)],
+            **kwargs,
+        )
+        if is_single:
+            res_lo = [res_lo]
+            res_hi = [res_hi]
+            res_multi = [res_multi]
+        res_lo_hi = []
+        for res_lo_, res_hi_ in zip(res_lo, res_hi):
+            if res_lo_.ndim == 0:
+                res_lo_ = np.expand_dims(res_lo_, -1)
+                res_hi_ = np.expand_dims(res_hi_, -1)
+                res_lo_hi_ = np.concatenate([res_lo_, res_hi_])
+            else:
+                res_lo_hi_ = np.concatenate(
+                    [(res_lo_[i], res_hi_[i]) for i in range(n_comp)],
+                )
+            res_lo_hi.append(res_lo_hi_)
+        np.testing.assert_array_almost_equal(res_lo_hi, res_multi)
+
+    def test_invalid_quantile_intervals(self):
+        np.random.seed(42)
+        series_a = TimeSeries.from_values(np.random.random((10, 2, 1)))
+        series_b = TimeSeries.from_values(np.random.random((10, 2, 10)))
+
+        # q not supported
+        with pytest.raises(ValueError) as exc:
+            _ = metrics.iw(series_a, series_b, q=[0.2])
+        assert str(exc.value).startswith(
+            "`q` is not supported for quantile interval metrics"
+        )
+
+        # no quantile interval
+        with pytest.raises(ValueError) as exc:
+            _ = metrics.iw(series_a, series_b, q_interval=None)
+        assert str(exc.value).startswith(
+            "Quantile interval metrics require setting `q_interval`."
+        )
+
+        # invalid interval type
+        with pytest.raises(ValueError) as exc:
+            _ = metrics.iw(series_a, series_b, q_interval=0.6)
+        assert (
+            str(exc.value)
+            == "`q_interval` must be a tuple (float, float) or a sequence of tuples (float, float)."
+        )
+
+        # invalid tuple length
+        with pytest.raises(ValueError) as exc:
+            _ = metrics.iw(series_a, series_b, q_interval=(0.1, 0.2, 0.3))
+        assert (
+            str(exc.value)
+            == "`q_interval` must be a tuple (float, float) or a sequence of tuples (float, float)."
+        )
+
+        # one tuple has invalid length invalid tuple length (raises a numpy error)
+        with pytest.raises(ValueError):
+            _ = metrics.iw(series_a, series_b, q_interval=[(0.1, 0.2), (0.2, 0.3, 0.4)])
+
+        # interval upper bound too high
+        with pytest.raises(ValueError) as exc:
+            _ = metrics.iw(series_a, series_b, q_interval=(0.1, 1.1))
+        assert str(exc.value).startswith(
+            "All `q` values must be in the range `(>=0,<=1)`."
+        )
+
+        # interval lower bound too low
+        with pytest.raises(ValueError) as exc:
+            _ = metrics.iw(series_a, series_b, q_interval=(-0.01, 0.1))
+        assert str(exc.value).startswith(
+            "All `q` values must be in the range `(>=0,<=1)`."
+        )
+
+        # lower interval equal to higher interval
+        with pytest.raises(ValueError) as exc:
+            _ = metrics.iw(series_a, series_b, q_interval=(0.2, 0.2))
+        assert str(exc.value).startswith(
+            "all intervals in `q_interval` must be tuples of (lower q, upper q)"
+        )
+
+        # lower interval higher than higher interval
+        with pytest.raises(ValueError) as exc:
+            _ = metrics.iw(series_a, series_b, q_interval=(0.3, 0.2))
+        assert str(exc.value).startswith(
+            "all intervals in `q_interval` must be tuples of (lower q, upper q)"
+        )
diff --git a/darts/tests/models/forecasting/test_backtesting.py b/darts/tests/models/forecasting/test_backtesting.py
index 0f6336043a..60f1b3e525 100644
--- a/darts/tests/models/forecasting/test_backtesting.py
+++ b/darts/tests/models/forecasting/test_backtesting.py
@@ -26,6 +26,7 @@
 from darts.utils.timeseries_generation import linear_timeseries as lt
 from darts.utils.timeseries_generation import random_walk_timeseries as rt
 from darts.utils.timeseries_generation import sine_timeseries as st
+from darts.utils.utils import generate_index
 
 logger = get_logger(__name__)
 
@@ -1252,6 +1253,179 @@ def time_reduced_metric(*args, **kwargs):
             for bt in bt_list:
                 np.testing.assert_array_almost_equal(bt, bt_expected)
 
+    @pytest.mark.parametrize(
+        "config",
+        itertools.product(
+            [
+                [metrics.mae],  # mae does not support time_reduction
+                [metrics.mae, metrics.ae],  # ae supports time_reduction
+                [metrics.miw],  # quantile interval metric
+                [metrics.miw, metrics.iw],
+            ],
+            [True, False],  # last_points_only
+        ),
+    )
+    def test_metric_quantiles_lpo(self, config):
+        """Tests backtest with quantile and quantile interval metrics from expected probabilistic or quantile
+        historical forecasts."""
+        metric, lpo = config
+        is_interval_metric = metric[0].__name__ == "miw"
+
+        q = [0.05, 0.5, 0.60, 0.95]
+        q_interval = [(0.05, 0.50), (0.50, 0.60), (0.60, 0.95), (0.05, 0.60)]
+
+        y = lt(length=20)
+        y = y.stack(y + 1.0)
+        hfc = TimeSeries.from_times_and_values(
+            times=generate_index(start=y.start_time() + 10 * y.freq, length=10),
+            values=np.random.random((10, 1, 100)),
+        )
+        hfc = hfc.stack(hfc + 1.0)
+        y = [y, y]
+        if lpo:
+            hfc = [hfc, hfc]
+        else:
+            hfc = [[hfc, hfc], [hfc]]
+
+        metric_kwargs = [{"component_reduction": np.median}]
+        if not is_interval_metric:
+            metric_kwargs[0]["q"] = q
+        else:
+            metric_kwargs[0]["q_interval"] = q_interval
+        if len(metric) > 1:
+            # give metric specific kwargs
+            metric_kwargs2 = {
+                "component_reduction": np.median,
+                "time_reduction": np.mean,
+            }
+            if not is_interval_metric:
+                metric_kwargs2["q"] = q
+            else:
+                metric_kwargs2["q_interval"] = q_interval
+            metric_kwargs.append(metric_kwargs2)
+
+        model = NaiveDrift()
+
+        bts = model.backtest(
+            series=y,
+            historical_forecasts=hfc,
+            metric=metric,
+            last_points_only=lpo,
+            reduction=None,
+            metric_kwargs=metric_kwargs,
+        )
+        assert isinstance(bts, list) and len(bts) == 2
+        if lpo:
+            bts = [[bt] for bt in bts]
+        # `ae` with time and component reduction is equal to `mae` with component reduction
+        hfc_single = hfc[0][0] if not lpo else hfc[0]
+        q_kwargs = {"q": q} if not is_interval_metric else {"q_interval": q_interval}
+        bt_expected = metric[0](
+            y[0], hfc_single, component_reduction=np.median, **q_kwargs
+        )
+        shape_expected = (len(q),)
+        if len(metric) > 1:
+            bt_expected = np.concatenate([bt_expected[:, None]] * 2, axis=1)
+            shape_expected += (len(metric),)
+        for bt_list in bts:
+            for bt in bt_list:
+                assert bt.shape == shape_expected
+                np.testing.assert_array_almost_equal(bt, bt_expected)
+
+        bts = model.backtest(
+            series=y,
+            historical_forecasts=hfc,
+            metric=metric,
+            last_points_only=lpo,
+            reduction=np.mean,
+            metric_kwargs=metric_kwargs,
+        )
+        assert isinstance(bts, list) and len(bts) == 2
+        for bt in bts:
+            assert bt.shape == shape_expected
+            np.testing.assert_array_almost_equal(bt, bt_expected)
+
+        def time_reduced_metric(*args, **kwargs):
+            metric_f = metrics.iw if is_interval_metric else metrics.ae
+            return metric_f(*args, **kwargs, time_reduction=np.mean)
+
+        # check that single kwargs can be used for all metrics if params are supported
+        metric = [metric[0], time_reduced_metric]
+        bts = model.backtest(
+            series=y,
+            historical_forecasts=hfc,
+            metric=metric,
+            last_points_only=lpo,
+            reduction=None,
+            metric_kwargs=metric_kwargs[0],
+        )
+        assert isinstance(bts, list) and len(bts) == 2
+        if lpo:
+            bts = [[bt] for bt in bts]
+        # `ae` / `miw` with time and component reduction is equal to `mae` / `miw` with component reduction
+        bt_expected = metric[0](
+            y[0], hfc_single, component_reduction=np.median, **q_kwargs
+        )
+        bt_expected = np.concatenate([bt_expected[:, None]] * 2, axis=1)
+        shape_expected = (len(q), len(metric))
+        for bt_list in bts:
+            for bt in bt_list:
+                assert bt.shape == shape_expected
+                np.testing.assert_array_almost_equal(bt, bt_expected)
+
+        bts = model.backtest(
+            series=y,
+            historical_forecasts=hfc,
+            metric=metric,
+            last_points_only=lpo,
+            reduction=np.mean,
+            metric_kwargs=metric_kwargs[0],
+        )
+        assert isinstance(bts, list) and len(bts) == 2
+        for bt in bts:
+            assert bt.shape == shape_expected
+            np.testing.assert_array_almost_equal(bt, bt_expected)
+
+        # without component reduction
+        metric_kwargs = {"component_reduction": None}
+        if not is_interval_metric:
+            metric_kwargs["q"] = q
+        else:
+            metric_kwargs["q_interval"] = q_interval
+        bts = model.backtest(
+            series=y,
+            historical_forecasts=hfc,
+            metric=metric,
+            last_points_only=lpo,
+            reduction=None,
+            metric_kwargs=metric_kwargs,
+        )
+        assert isinstance(bts, list) and len(bts) == 2
+        if lpo:
+            bts = [[bt] for bt in bts]
+
+        # `ae` / `iw` with time and no component reduction is equal to `mae` / `miw` without component reduction
+        bt_expected = metric[0](y[0], hfc_single, **metric_kwargs)
+        bt_expected = np.concatenate([bt_expected[:, None]] * 2, axis=1)
+        shape_expected = (len(q) * y[0].width, len(metric))
+        for bt_list in bts:
+            for bt in bt_list:
+                assert bt.shape == shape_expected
+                np.testing.assert_array_almost_equal(bt, bt_expected)
+
+        bts = model.backtest(
+            series=y,
+            historical_forecasts=hfc,
+            metric=metric,
+            last_points_only=lpo,
+            reduction=np.mean,
+            metric_kwargs=metric_kwargs,
+        )
+        assert isinstance(bts, list) and len(bts) == 2
+        for bt in bts:
+            assert bt.shape == shape_expected
+            np.testing.assert_array_almost_equal(bt, bt_expected)
+
     @pytest.mark.parametrize(
         "config",
         product([True, False], [True, False]),
diff --git a/darts/tests/models/forecasting/test_residuals.py b/darts/tests/models/forecasting/test_residuals.py
index 0e975240c4..40d15ebf0e 100644
--- a/darts/tests/models/forecasting/test_residuals.py
+++ b/darts/tests/models/forecasting/test_residuals.py
@@ -1,15 +1,23 @@
 import itertools
 
 import numpy as np
+import pandas as pd
 import pytest
 
 import darts.metrics as metrics
+from darts import TimeSeries, concatenate
 from darts.datasets import AirPassengersDataset
 from darts.logging import get_logger
 from darts.models import LinearRegressionModel, NaiveDrift, NaiveSeasonal
 from darts.tests.models.forecasting.test_regression_models import dummy_timeseries
 from darts.utils.timeseries_generation import constant_timeseries as ct
 from darts.utils.timeseries_generation import linear_timeseries as lt
+from darts.utils.utils import (
+    generate_index,
+    likelihood_component_names,
+    quantile_interval_names,
+    quantile_names,
+)
 
 logger = get_logger(__name__)
 
@@ -612,3 +620,260 @@ def test_sample_weight(self, config):
         for res_nw, res_w in zip(res_non_weighted, res_weighted):
             with pytest.raises(AssertionError):
                 np.testing.assert_array_almost_equal(res_w, res_nw)
+
+    @pytest.mark.parametrize(
+        "config",
+        itertools.product(
+            [metrics.ae, metrics.iw],  # quantile (interval) metrics
+            [True, False],  # last_points_only
+            [False, True],  # from stochastic predictions (or predicted quantiles)
+        ),
+    )
+    def test_residuals_with_quantiles_metrics(self, config):
+        """Tests residuals with quantile metrics from expected probabilistic or quantile historical forecasts."""
+        metric, lpo, stochastic_pred = config
+        is_interval_metric = metric.__name__ == "iw"
+
+        # multi-quantile metrics yield more components
+        q = [0.05, 0.50, 0.60, 0.95]
+        q_interval = [(0.05, 0.50), (0.50, 0.60), (0.60, 0.95), (0.05, 0.60)]
+
+        y = lt(length=20)
+        y = y.stack(y + 1.0)
+        if not is_interval_metric:
+            q_comp_names_expected = pd.Index(
+                likelihood_component_names(
+                    components=y.components,
+                    parameter_names=quantile_names(q),
+                )
+            )
+        else:
+            q_comp_names_expected = pd.Index(
+                likelihood_component_names(
+                    components=y.components,
+                    parameter_names=quantile_interval_names(q_interval),
+                )
+            )
+        # historical forecasts
+        vals = np.random.random((10, 1, 100))
+        if not stochastic_pred:
+            vals = np.quantile(vals, q, axis=2).transpose((1, 0, 2))
+            comp_names = pd.Index(
+                likelihood_component_names(
+                    components=y.components,
+                    parameter_names=quantile_names(q=q),
+                )
+            )
+        else:
+            comp_names = y.components
+        vals = np.concatenate([vals, vals + 1], axis=1)
+        hfc = TimeSeries.from_times_and_values(
+            times=generate_index(start=y.start_time() + 10 * y.freq, length=10),
+            values=vals,
+            columns=comp_names,
+        )
+
+        y = [y, y]
+        if lpo:
+            hfc = [hfc, hfc]
+        else:
+            hfc = [[hfc, hfc], [hfc]]
+
+        metric_kwargs = {"component_reduction": None}
+        if not is_interval_metric:
+            metric_kwargs["q"] = q
+        else:
+            metric_kwargs["q_interval"] = q_interval
+
+        model = NaiveDrift()
+
+        # return TimeSeries
+        bts = model.residuals(
+            series=y,
+            historical_forecasts=hfc,
+            metric=metric,
+            last_points_only=lpo,
+            metric_kwargs=metric_kwargs,
+        )
+        assert isinstance(bts, list) and len(bts) == 2
+        if lpo:
+            bts = [[bt] for bt in bts]
+
+        # `ae` with time and component reduction is equal to `mae` with component reduction
+        hfc_single = hfc[0][0] if not lpo else hfc[0]
+        bt_expected = metric(y[0], hfc_single, **metric_kwargs)
+        shape_expected = (len(hfc_single), len(q) * y[0].n_components)
+        for bt_list in bts:
+            for bt in bt_list:
+                assert bt.shape[:2] == shape_expected
+                assert bt.components.equals(q_comp_names_expected)
+                np.testing.assert_array_almost_equal(bt.values(), bt_expected)
+
+        # values only
+        bts = model.residuals(
+            series=y,
+            historical_forecasts=hfc,
+            metric=metric,
+            last_points_only=lpo,
+            metric_kwargs=metric_kwargs,
+            values_only=True,
+        )
+        assert isinstance(bts, list) and len(bts) == 2
+        if lpo:
+            bts = [[bt] for bt in bts]
+
+        # `ae` with time and component reduction is equal to `mae` with component reduction
+        for bt_list in bts:
+            for bt in bt_list:
+                assert bt.shape[:2] == shape_expected
+                np.testing.assert_array_almost_equal(bt[:, :, 0], bt_expected)
+
+    @pytest.mark.parametrize(
+        "config",
+        list(
+            itertools.product(
+                [metrics.ae, metrics.iw],  # quantile (interval) metrics
+                [True, False],  # last_points_only
+            )
+        ),
+    )
+    def test_quantiles_from_model(self, config):
+        """Tests residuals from quantile regression model works for both direct likelihood parameter prediction or
+        sampled prediction by giving the correct metrics kwargs."""
+        metric, lpo = config
+
+        is_interval_metric = metric.__name__ == "iw"
+
+        # multi-quantile metrics yield more components
+        q = [0.05, 0.50, 0.95]
+        q_interval = [(0.05, 0.50), (0.50, 0.95), (0.05, 0.95)]
+
+        y = lt(length=20)
+        y = y.stack(y + 1.0)
+        if not is_interval_metric:
+            q_comp_names_expected = pd.Index(
+                likelihood_component_names(
+                    components=y.components,
+                    parameter_names=quantile_names(q),
+                )
+            )
+        else:
+            q_comp_names_expected = pd.Index(
+                likelihood_component_names(
+                    components=y.components,
+                    parameter_names=quantile_interval_names(q_interval),
+                )
+            )
+        y = [y, y]
+        metric_kwargs = {"component_reduction": None}
+        if not is_interval_metric:
+            metric_kwargs["q"] = q
+        else:
+            metric_kwargs["q_interval"] = q_interval
+
+        icl = 3
+        model = LinearRegressionModel(
+            lags=icl, output_chunk_length=1, likelihood="quantile", quantiles=q
+        )
+        model.fit(y)
+
+        # quantile forecasts
+        bts = model.residuals(
+            series=y,
+            forecast_horizon=1,
+            metric=metric,
+            last_points_only=lpo,
+            metric_kwargs=metric_kwargs,
+            predict_likelihood_parameters=True,
+            retrain=False,
+        )
+        assert isinstance(bts, list) and len(bts) == 2
+        if not lpo:
+            bts = [concatenate(bt, axis=0) for bt in bts]
+
+        # `ae` with time and component reduction is equal to `mae` with component reduction
+        shape_expected = (len(y[0]) - icl, len(q) * y[0].n_components)
+        for bt in bts:
+            assert bt.shape[:2] == shape_expected
+            assert bt.components.equals(q_comp_names_expected)
+
+        # probabilistic forecasts
+        bts_prob = model.residuals(
+            series=y,
+            forecast_horizon=1,
+            metric=metric,
+            last_points_only=lpo,
+            metric_kwargs=metric_kwargs,
+            predict_likelihood_parameters=False,
+            num_samples=1000,
+            retrain=False,
+        )
+        assert isinstance(bts_prob, list) and len(bts_prob) == 2
+        if not lpo:
+            bts_prob = [concatenate(bt, axis=0) for bt in bts_prob]
+        for bt_p, bt_q in zip(bts_prob, bts):
+            assert bt_p.shape == bt_q.shape
+            assert bt_p.components.equals(bt_q.components)
+            # check that the results are similar
+            assert np.abs(bt_p.all_values() - bt_q.all_values()).max() < 0.1
+
+        # single quantile
+        q_single = 0.05
+        q_interval_single = (0.05, 0.50)
+        metric_kwargs = {"component_reduction": None}
+        if not is_interval_metric:
+            metric_kwargs["q"] = q_single
+        else:
+            metric_kwargs["q_interval"] = q_interval_single
+        bts = model.residuals(
+            series=y,
+            forecast_horizon=1,
+            metric=metric,
+            last_points_only=lpo,
+            metric_kwargs=metric_kwargs,
+            predict_likelihood_parameters=True,
+            retrain=False,
+        )
+        assert isinstance(bts, list) and len(bts) == 2
+        if not lpo:
+            bts = [concatenate(bt, axis=0) for bt in bts]
+
+        # `ae` with time and component reduction is equal to `mae` with component reduction
+        shape_expected = (len(y[0]) - icl, y[0].n_components)
+        for bt in bts:
+            assert bt.shape[:2] == shape_expected
+            assert bt.components.equals(
+                pd.Index(
+                    likelihood_component_names(
+                        y[0].components,
+                        parameter_names=(
+                            quantile_names([q_single])
+                            if not is_interval_metric
+                            else quantile_interval_names([q_interval_single])
+                        ),
+                    )
+                )
+            )
+
+        # wrong quantile
+        q_wrong = [0.99]
+        q_interval_wrong = (0.05, 0.99)
+        metric_kwargs = {"component_reduction": None}
+        if not is_interval_metric:
+            metric_kwargs["q"] = q_wrong
+        else:
+            metric_kwargs["q_interval"] = q_interval_wrong
+        with pytest.raises(ValueError) as exc:
+            _ = model.residuals(
+                series=y,
+                forecast_horizon=1,
+                metric=metric,
+                last_points_only=lpo,
+                metric_kwargs=metric_kwargs,
+                predict_likelihood_parameters=True,
+                retrain=False,
+            )
+        assert str(exc.value).startswith(
+            f"Computing a metric with quantile(s) "
+            f"`q={'[0.99]' if not is_interval_metric else '[0.05 0.99]'}` is only supported"
+        )
diff --git a/darts/tests/utils/test_utils.py b/darts/tests/utils/test_utils.py
index 3b82c5f1b2..d629851cea 100644
--- a/darts/tests/utils/test_utils.py
+++ b/darts/tests/utils/test_utils.py
@@ -7,7 +7,15 @@
 from darts.utils import _with_sanity_checks
 from darts.utils.missing_values import extract_subseries
 from darts.utils.ts_utils import retain_period_common_to_all
-from darts.utils.utils import expand_arr, freqs, generate_index, n_steps_between
+from darts.utils.utils import (
+    expand_arr,
+    freqs,
+    generate_index,
+    likelihood_component_names,
+    n_steps_between,
+    quantile_interval_names,
+    quantile_names,
+)
 
 
 class TestUtils:
@@ -587,3 +595,39 @@ def test_expand_arr(self, config):
         arr = expand_arr(arr, ndim=3)
         assert arr.shape == shape_expected
         np.testing.assert_array_almost_equal(arr, arr_expected)
+
+    def test_likelihood_component_names(self):
+        names = likelihood_component_names(["a", "b"], ["1", "2", "3"])
+        assert names == ["a_1", "a_2", "a_3", "b_1", "b_2", "b_3"]
+
+        assert (
+            likelihood_component_names(pd.Index(["a", "b"]), ["1", "2", "3"]) == names
+        )
+
+    @pytest.mark.parametrize(
+        "config",
+        [
+            (0.25, "a_q0.25"),
+            (0.2501, "a_q0.25"),
+            ([0.25], ["a_q0.25"]),
+            ([0.25, 0.75], ["a_q0.25", "a_q0.75"]),
+        ],
+    )
+    def test_quantile_names(self, config):
+        q, names_expected = config
+        names = quantile_names(q, "a")
+        assert names == names_expected
+
+    @pytest.mark.parametrize(
+        "config",
+        [
+            ((0.25, 0.5), "a_q0.25_q0.50"),
+            ((0.2501, 0.4999), "a_q0.25_q0.50"),
+            ([(0.25, 0.5)], ["a_q0.25_q0.50"]),
+            ([(0.25, 0.50), (0.6, 0.75)], ["a_q0.25_q0.50", "a_q0.60_q0.75"]),
+        ],
+    )
+    def test_quantile_interval_names(self, config):
+        q, names_expected = config
+        names = quantile_interval_names(q, "a")
+        assert names == names_expected
diff --git a/darts/timeseries.py b/darts/timeseries.py
index edadecb422..f6fe161464 100644
--- a/darts/timeseries.py
+++ b/darts/timeseries.py
@@ -74,6 +74,7 @@
 # dimension names in the DataArray
 # the "time" one can be different, if it has a name in the underlying Series/DataFrame.
 DIMS = ("time", "component", "sample")
+AXES = {"time": 0, "component": 1, "sample": 2}
 
 VALID_INDEX_TYPES = (pd.DatetimeIndex, pd.RangeIndex)
 STATIC_COV_TAG = "static_covariates"
@@ -1361,15 +1362,20 @@ def bottom_level_series(self) -> Optional[List[Self]]:
             else None
         )
 
+    @property
+    def shape(self) -> Tuple[int]:
+        """The shape of the series (n_timesteps, n_components, n_samples)."""
+        return self._xa.shape
+
     @property
     def n_samples(self) -> int:
         """Number of samples contained in the series."""
-        return len(self._xa.sample)
+        return self.shape[AXES["sample"]]
 
     @property
     def n_components(self) -> int:
         """Number of components (dimensions) contained in the series."""
-        return len(self._xa.component)
+        return self.shape[AXES["component"]]
 
     @property
     def width(self) -> int:
@@ -1379,12 +1385,12 @@ def width(self) -> int:
     @property
     def n_timesteps(self) -> int:
         """Number of time steps in the series."""
-        return len(self._time_index)
+        return self.shape[AXES["time"]]
 
     @property
     def is_deterministic(self) -> bool:
         """Whether this series is deterministic."""
-        return self.n_samples == 1
+        return self.shape[AXES["sample"]] == 1
 
     @property
     def is_stochastic(self) -> bool:
@@ -1399,7 +1405,7 @@ def is_probabilistic(self) -> bool:
     @property
     def is_univariate(self) -> bool:
         """Whether this series is univariate."""
-        return self.n_components == 1
+        return self.shape[AXES["component"]] == 1
 
     @property
     def freq(self) -> Union[pd.DateOffset, int]:
diff --git a/darts/utils/likelihood_models.py b/darts/utils/likelihood_models.py
index 6d4d7fd1a1..75b7d86f07 100644
--- a/darts/utils/likelihood_models.py
+++ b/darts/utils/likelihood_models.py
@@ -60,7 +60,11 @@
 from darts.logging import raise_if_not
 
 # TODO: Table on README listing distribution, possible priors and wiki article
-from darts.utils.utils import _check_quantiles
+from darts.utils.utils import (
+    _check_quantiles,
+    likelihood_component_names,
+    quantile_names,
+)
 
 MIN_CAUCHY_GAMMA_SAMPLING = 1e-100
 
@@ -1354,11 +1358,10 @@ def _params_from_output(self, model_output: torch.Tensor) -> None:
 
     def likelihood_components_names(self, input_series: TimeSeries) -> List[str]:
         """Each component have their own quantiles"""
-        return [
-            f"{tgt_name}_q{quantile:.2f}"
-            for tgt_name in input_series.components
-            for quantile in self.quantiles
-        ]
+        return likelihood_component_names(
+            components=input_series.components,
+            parameter_names=quantile_names(self.quantiles),
+        )
 
     def simplified_name(self) -> str:
         return "quantile"
diff --git a/darts/utils/utils.py b/darts/utils/utils.py
index 125e1b8afc..c7c2480215 100644
--- a/darts/utils/utils.py
+++ b/darts/utils/utils.py
@@ -6,7 +6,7 @@
 from enum import Enum
 from functools import wraps
 from inspect import Parameter, getcallargs, signature
-from typing import Callable, Iterator, List, Optional, Tuple, TypeVar, Union
+from typing import Callable, Iterator, List, Optional, Sequence, Tuple, TypeVar, Union
 
 import numpy as np
 import pandas as pd
@@ -67,6 +67,66 @@ class ModelMode(Enum):
 }
 
 
+def likelihood_component_names(
+    components: Union[pd.Index, List[str]], parameter_names: List[str]
+):
+    """Generates formatted likelihood parameter names for components and parameter names.
+
+    The order of the returned names is: `[comp1_param_1, ... comp1_param_n, ..., comp_n_param_n]`.
+
+    Parameters
+    ----------
+    components
+        A sequence of component names to add to the beginning of the returned names.
+    parameter_names
+        A sequence of likelihood parameter names to add to the end of the returned names.
+    """
+    return [
+        f"{tgt_name}_{param_n}"
+        for tgt_name in components
+        for param_n in parameter_names
+    ]
+
+
+def quantile_names(q: Union[float, List[float]], component: Optional[str] = None):
+    """Generates formatted quantile names, optionally added to a component name.
+
+    Parameters
+    ----------
+    q
+        A float or list of floats with the quantiles to generate the names for.
+    component
+        Optionally, a component name to add to the beginning of the quantile names.
+    """
+    # predicted quantile text format
+    comp = f"{component}_" if component is not None else ""
+    if isinstance(q, float):
+        return f"{comp}q{q:.2f}"
+    else:
+        return [f"{comp}q{q_i:.2f}" for q_i in q]
+
+
+def quantile_interval_names(
+    q_interval: Union[Tuple[float, float], Sequence[Tuple[float, float]]],
+    component: Optional[str] = None,
+):
+    """Generates formatted quantile interval names, optionally added to a component name.
+
+    Parameters
+    ----------
+    q_interval
+        A tuple or multiple tuples with the (lower bound, upper bound) of the quantile intervals.
+    component
+        Optionally, a component name to add to the beginning of the quantile names.
+    """
+    # predicted quantile text format
+    comp = f"{component}_" if component is not None else ""
+    if isinstance(q_interval, tuple):
+        return f"{comp}q{q_interval[0]:.2f}_q{q_interval[1]:.2f}"
+    else:
+        return [f"{comp}q{q_lo:.2f}_q{q_hi:.2f}" for q_lo, q_hi in q_interval]
+
+
 def _build_tqdm_iterator(iterable, verbose, **kwargs):
     """
     Build an iterable, possibly using tqdm (either in notebook or regular mode)