diff --git a/examples/q_learning.py b/examples/q_learning.py
index dbe983d..ac496ec 100644
--- a/examples/q_learning.py
+++ b/examples/q_learning.py
@@ -15,6 +15,7 @@
 import numpy.typing as npt
 from csnlp import Nlp
 from csnlp.wrappers import Mpc
+from gymnasium.spaces import Box
 from gymnasium.wrappers import TimeLimit
 
 from mpcrl import LearnableParameter, LearnableParametersDict, LstdQLearningAgent
@@ -37,6 +38,7 @@ class LtiSystem(gym.Env[npt.NDArray[np.floating], float]):
     a_bnd = (-1, 1)  # bounds of control input
     w = np.asarray([[1e2], [1e2]])  # penalty weight for bound violations
     e_bnd = (-1e-1, 0)  # uniform noise bounds
+    action_space = Box(*a_bnd, (nu,), np.float64)
 
     def reset(
         self,
diff --git a/src/mpcrl/agents/common/agent.py b/src/mpcrl/agents/common/agent.py
index 846fb73..100d081 100644
--- a/src/mpcrl/agents/common/agent.py
+++ b/src/mpcrl/agents/common/agent.py
@@ -75,8 +75,8 @@ def __init__(
             values. Use this to specify fixed parameters, that is, non-learnable. If
             `None`, then no fixed parameter is assumed.
         exploration : ExplorationStrategy, optional
-            Exploration strategy for inducing exploration in the MPC policy. By default
-            `None`, in which case `NoExploration` is used.
+            Exploration strategy for inducing exploration in the online MPC policy. By
+            default `None`, in which case `NoExploration` is used.
         warmstart: "last" or "last-successful" or WarmStartStrategy, optional
             The warmstart strategy for the MPC's NLP. If `last-successful`, the last
             successful solution is used to warm start the solver for the next iteration.
@@ -332,7 +332,7 @@ def state_value(
         """
         V = self._V
         exploration = self._exploration
-        exploration_mode = self._exploration.mode
+        exploration_mode = exploration.mode
         na = V.na
         if deterministic or exploration_mode is None or not exploration.can_explore():
             pert = None
diff --git a/src/mpcrl/agents/lstd_dpg.py b/src/mpcrl/agents/lstd_dpg.py
index 82c7a65..1680f26 100644
--- a/src/mpcrl/agents/lstd_dpg.py
+++ b/src/mpcrl/agents/lstd_dpg.py
@@ -105,8 +105,8 @@ def __init__(
             their bounds and values. This dict is complementary with `fixed_parameters`,
             which contains the MPC parameters that are not learnt by the agent.
         exploration : ExplorationStrategy, optional
-            Exploration strategy for inducing exploration in the MPC policy (it is
-            mandatory to explore in DPG).
+            Exploration strategy for inducing exploration in the online MPC policy (it
+            is mandatory to explore in DPG).
         fixed_parameters : dict[str, array_like] or collection of, optional
             A dict (or collection of dict, in case of `csnlp.MultistartNlp`) whose keys
             are the names of the MPC parameters and the values are their corresponding
diff --git a/src/mpcrl/agents/lstd_q_learning.py b/src/mpcrl/agents/lstd_q_learning.py
index 105f2f9..c6435bc 100644
--- a/src/mpcrl/agents/lstd_q_learning.py
+++ b/src/mpcrl/agents/lstd_q_learning.py
@@ -100,8 +100,10 @@ def __init__(
             values. Use this to specify fixed parameters, that is, non-learnable. If
             `None`, then no fixed parameter is assumed.
         exploration : ExplorationStrategy, optional
-            Exploration strategy for inducing exploration in the MPC policy. By default
-            `None`, in which case `NoExploration` is used in the fixed-MPC agent.
+            Exploration strategy for inducing exploration in the online MPC policy. By
+            default `None`, in which case `NoExploration` is used. It should not be
+            set when learning off-policy, as exploration should instead be handled by
+            the off-policy data-generation process.
         experience : int or ExperienceReplay, optional
             The container for experience replay memory. If `None` is passed, then a
             memory with length 1 is created, i.e., it keeps only the latest memory
@@ -252,7 +254,7 @@
         x_lam_p = cs.vertcat(nlp.primal_dual, nlp.p)
 
         # compute first order sensitivity
-        snlp = NlpSensitivity(self._Q.nlp, theta)
+        snlp = NlpSensitivity(nlp, theta)
         gradient = snlp.jacobians["L-p"]  # exact gradient, i.e., dQ/dtheta
 
         if hessian_type == "none":
diff --git a/src/mpcrl/core/exploration.py b/src/mpcrl/core/exploration.py
index 07698ab..2a4e4f7 100644
--- a/src/mpcrl/core/exploration.py
+++ b/src/mpcrl/core/exploration.py
@@ -88,6 +88,9 @@ def __repr__(self) -> str:
 class NoExploration(ExplorationStrategy):
     """Strategy where no exploration is allowed at any time or, in other words, the
     policy is always deterministic (only based on the current state, and not perturbed).
+
+    This is a special kind of `ExplorationStrategy`: it is the only one with
+    neither a `hook` nor a `mode`.
     """
 
     def __init__(self) -> None:
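For reference, a minimal sketch of what the new `action_space` attribute in the example exposes, using the standard `gymnasium.spaces.Box` API; the value `nu = 1` is assumed here for illustration (the LTI example has a single control input):

```python
import numpy as np
from gymnasium.spaces import Box

nu = 1  # assumed action dimension for illustration
a_bnd = (-1, 1)  # same control-input bounds as in the example

# Same construction as in the diff: the scalar bounds are broadcast over
# the shape (nu,), giving a box [-1, 1]^nu of float64 actions.
action_space = Box(*a_bnd, (nu,), np.float64)

print(action_space.contains(np.array([0.5])))  # True: inside the bounds
print(action_space.contains(np.array([2.0])))  # False: violates the bounds
print(action_space.sample())                   # uniform sample from the box
```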
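And a hedged sketch of the `NoExploration` contract that the new docstring describes, as exercised by the `state_value` hunk above; that `mode` reads as `None` and `can_explore()` returns `False` for this strategy is inferred from that hunk and the docstring, not verified against the library:

```python
from mpcrl.core.exploration import NoExploration

strategy = NoExploration()

# Inferred behaviour: with no `mode` (i.e., `mode` is None), the branch in
# `Agent.state_value` sets `pert = None`, so the MPC policy is solved
# deterministically and no perturbation is ever applied.
assert strategy.mode is None       # assumption: absent mode reads as None
assert not strategy.can_explore()  # assumption: exploration is never allowed
```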