diff --git a/examples/q_learning.py b/examples/q_learning.py
index dbe983d..ac496ec 100644
--- a/examples/q_learning.py
+++ b/examples/q_learning.py
@@ -15,6 +15,7 @@
 import numpy.typing as npt
 from csnlp import Nlp
 from csnlp.wrappers import Mpc
+from gymnasium.spaces import Box
 from gymnasium.wrappers import TimeLimit
 
 from mpcrl import LearnableParameter, LearnableParametersDict, LstdQLearningAgent
@@ -37,6 +38,7 @@ class LtiSystem(gym.Env[npt.NDArray[np.floating], float]):
     a_bnd = (-1, 1)  # bounds of control input
     w = np.asarray([[1e2], [1e2]])  # penalty weight for bound violations
     e_bnd = (-1e-1, 0)  # uniform noise bounds
+    action_space = Box(*a_bnd, (nu,), np.float64)
 
     def reset(
         self,
diff --git a/src/mpcrl/agents/common/agent.py b/src/mpcrl/agents/common/agent.py
index 846fb73..100d081 100644
--- a/src/mpcrl/agents/common/agent.py
+++ b/src/mpcrl/agents/common/agent.py
@@ -75,8 +75,8 @@ def __init__(
             values. Use this to specify fixed parameters, that is, non-learnable. If
             `None`, then no fixed parameter is assumed.
         exploration : ExplorationStrategy, optional
-            Exploration strategy for inducing exploration in the MPC policy. By default
-            `None`, in which case `NoExploration` is used.
+            Exploration strategy for inducing exploration in the online MPC policy. By
+            default `None`, in which case `NoExploration` is used.
         warmstart: "last" or "last-successful" or WarmStartStrategy, optional
             The warmstart strategy for the MPC's NLP. If `last-successful`, the last
             successful solution is used to warm start the solver for the next iteration.
@@ -332,7 +332,7 @@ def state_value(
         """
         V = self._V
         exploration = self._exploration
-        exploration_mode = self._exploration.mode
+        exploration_mode = exploration.mode
         na = V.na
         if deterministic or exploration_mode is None or not exploration.can_explore():
             pert = None
diff --git a/src/mpcrl/agents/lstd_dpg.py b/src/mpcrl/agents/lstd_dpg.py
index 82c7a65..1680f26 100644
--- a/src/mpcrl/agents/lstd_dpg.py
+++ b/src/mpcrl/agents/lstd_dpg.py
@@ -105,8 +105,8 @@ def __init__(
             their bounds and values. This dict is complementary with `fixed_parameters`,
             which contains the MPC parameters that are not learnt by the agent.
         exploration : ExplorationStrategy, optional
-            Exploration strategy for inducing exploration in the MPC policy (it is
-            mandatory to explore in DPG).
+            Exploration strategy for inducing exploration in the online MPC policy (it
+            is mandatory to explore in DPG).
         fixed_parameters : dict[str, array_like] or collection of, optional
             A dict (or collection of dict, in case of `csnlp.MultistartNlp`) whose keys
             are the names of the MPC parameters and the values are their corresponding
diff --git a/src/mpcrl/agents/lstd_q_learning.py b/src/mpcrl/agents/lstd_q_learning.py
index 105f2f9..c6435bc 100644
--- a/src/mpcrl/agents/lstd_q_learning.py
+++ b/src/mpcrl/agents/lstd_q_learning.py
@@ -100,8 +100,10 @@ def __init__(
             values. Use this to specify fixed parameters, that is, non-learnable. If
             `None`, then no fixed parameter is assumed.
         exploration : ExplorationStrategy, optional
-            Exploration strategy for inducing exploration in the MPC policy. By default
-            `None`, in which case `NoExploration` is used in the fixed-MPC agent.
+            Exploration strategy for inducing exploration in the online MPC policy. By
+            default `None`, in which case `NoExploration` is used. It should not be
+            set when learning off-policy, as exploration should instead be handled by
+            the off-policy data-generation process.
         experience : int or ExperienceReplay, optional
             The container for experience replay memory. If `None` is passed, then a
             memory with length 1 is created, i.e., it keeps only the latest memory
@@ -252,7 +254,7 @@
         x_lam_p = cs.vertcat(nlp.primal_dual, nlp.p)
 
         # compute first order sensitivity
-        snlp = NlpSensitivity(self._Q.nlp, theta)
+        snlp = NlpSensitivity(nlp, theta)
         gradient = snlp.jacobians["L-p"]  # exact gradient, i.e., dQ/dtheta
 
         if hessian_type == "none":
diff --git a/src/mpcrl/core/exploration.py b/src/mpcrl/core/exploration.py
index 07698ab..2a4e4f7 100644
--- a/src/mpcrl/core/exploration.py
+++ b/src/mpcrl/core/exploration.py
@@ -88,6 +88,9 @@ def __repr__(self) -> str:
 class NoExploration(ExplorationStrategy):
     """Strategy where no exploration is allowed at any time or, in other words, the
     policy is always deterministic (only based on the current state, and not perturbed).
+
+    This is a special kind of `ExplorationStrategy`: it is the only one with
+    neither a `hook` nor a `mode`.
     """
 
     def __init__(self) -> None:
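For reference, a minimal sketch of what the new `action_space` attribute in the example exposes, using the standard `gymnasium.spaces.Box` API; the value `nu = 1` is assumed here for illustration (the LTI example has a single control input):

```python
import numpy as np
from gymnasium.spaces import Box

nu = 1  # assumed action dimension for illustration
a_bnd = (-1, 1)  # same control-input bounds as in the example

# Same construction as in the diff: the scalar bounds are broadcast over
# the shape (nu,), giving a box [-1, 1]^nu of float64 actions.
action_space = Box(*a_bnd, (nu,), np.float64)

print(action_space.contains(np.array([0.5])))  # True: inside the bounds
print(action_space.contains(np.array([2.0])))  # False: violates the bounds
print(action_space.sample())                   # uniform sample from the box
```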
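And a hedged sketch of the `NoExploration` contract that the new docstring describes, as exercised by the `state_value` hunk above; that `mode` reads as `None` and `can_explore()` returns `False` for this strategy is inferred from that hunk and the docstring, not verified against the library:

```python
from mpcrl.core.exploration import NoExploration

strategy = NoExploration()

# Inferred behaviour: with no `mode` (i.e., `mode` is None), the branch in
# `Agent.state_value` sets `pert = None`, so the MPC policy is solved
# deterministically and no perturbation is ever applied.
assert strategy.mode is None       # assumption: absent mode reads as None
assert not strategy.can_explore()  # assumption: exploration is never allowed
```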