-
Hey @Toni-SM, thanks for this fantastic library. Example from gymnasium documentation. import gymnasium as gym
# Random-action rollout from the gymnasium documentation: step the
# environment for 1000 steps and reset it whenever an episode ends.
env = gym.make("LunarLander-v2", render_mode="human")
observation, info = env.reset()
for _ in range(1000):
    action = env.action_space.sample()  # agent policy that uses the observation and info
    observation, reward, terminated, truncated, info = env.step(action)
    # an episode ends either by termination or by truncation
    if terminated or truncated:
        observation, info = env.reset()
env.close() I am using skrl version 1.2.0 import torch
import torch.nn as nn
# import the skrl components to build the RL system
from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG
from skrl.envs.wrappers.torch import wrap_env
from skrl.memories.torch import RandomMemory
from skrl.models.torch import DeterministicMixin, GaussianMixin, Model
from skrl.resources.preprocessors.torch import RunningStandardScaler
from skrl.resources.schedulers.torch import KLAdaptiveRL
from skrl.trainers.torch import SequentialTrainer
from skrl.utils import set_seed
# seed for reproducibility
set_seed() # e.g. `set_seed(42)` for fixed seed
# define models (stochastic and deterministic models) using mixins
class Policy(GaussianMixin, Model):
    """Gaussian (stochastic) policy backed by a small MLP.

    The network maps observations to the mean action; the log standard
    deviation is a learnable parameter that does not depend on the input.
    """

    def __init__(self, observation_space, action_space, device, clip_actions=False,
                 clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"):
        """Initialise both base classes and build the network.

        ``observation_space``, ``action_space`` and ``device`` are forwarded
        to ``Model``; the remaining arguments are forwarded unchanged to
        ``GaussianMixin``.
        """
        Model.__init__(self, observation_space, action_space, device)
        GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction)

        self.net = nn.Sequential(nn.Linear(self.num_observations, 64),
                                 nn.ReLU(),
                                 nn.Linear(64, 64),
                                 nn.ReLU(),
                                 nn.Linear(64, self.num_actions))
        # one log-std value per action dimension, initialised to zero
        self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))

    def compute(self, inputs, role):
        """Return ``(mean_actions, log_std, {})`` for ``inputs["states"]``."""
        # Pendulum-v1 action_space is -2 to 2
        return 2 * torch.tanh(self.net(inputs["states"])), self.log_std_parameter, {}
class Value(DeterministicMixin, Model):
    """Deterministic value model: an MLP mapping observations to one scalar."""

    def __init__(self, observation_space, action_space, device, clip_actions=False):
        """Initialise both base classes and build the network.

        ``observation_space``, ``action_space`` and ``device`` are forwarded
        to ``Model``; ``clip_actions`` is forwarded to ``DeterministicMixin``.
        """
        Model.__init__(self, observation_space, action_space, device)
        DeterministicMixin.__init__(self, clip_actions)

        self.net = nn.Sequential(nn.Linear(self.num_observations, 64),
                                 nn.ReLU(),
                                 nn.Linear(64, 64),
                                 nn.ReLU(),
                                 nn.Linear(64, 1))

    def compute(self, inputs, role):
        """Return ``(value, {})`` for ``inputs["states"]``."""
        return self.net(inputs["states"]), {}
# load and wrap the gymnasium environment.
# note: the environment version may change depending on the gymnasium version
try:
    env = gym.vector.make("Pendulum-v1", num_envs=4, asynchronous=False)
except (gym.error.DeprecatedEnv, gym.error.VersionNotFound):
    # fall back to the first registered Pendulum version
    env_id = [spec for spec in gym.envs.registry if spec.startswith("Pendulum-v")][0]
    print("Pendulum-v1 not found. Trying {}".format(env_id))
    env = gym.vector.make(env_id, num_envs=4, asynchronous=False)
# wrap once, whichever branch created the environment
env = wrap_env(env, wrapper="gymnasium")
device = env.device
# instantiate a memory as rollout buffer (any memory can be used for this)
memory = RandomMemory(memory_size=1024, num_envs=env.num_envs, device=device)
# instantiate the agent's models (function approximators).
# PPO requires 2 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/api/agents/ppo.html#models
models = {}
models["policy"] = Policy(env.observation_space, env.action_space, device, clip_actions=True)
models["value"] = Value(env.observation_space, env.action_space, device)
# configure and instantiate the agent (visit its documentation to see all the options)
# https://skrl.readthedocs.io/en/latest/api/agents/ppo.html#configuration-and-hyperparameters
cfg = PPO_DEFAULT_CONFIG.copy()
# PPO hyperparameters, grouped into a single update
cfg.update({
    "rollouts": 1024,  # memory_size
    "learning_epochs": 10,
    "mini_batches": 32,
    "discount_factor": 0.9,
    "lambda": 0.95,
    "learning_rate": 1e-3,
    "learning_rate_scheduler": KLAdaptiveRL,
    "learning_rate_scheduler_kwargs": {"kl_threshold": 0.008},
    "grad_norm_clip": 0.5,
    "ratio_clip": 0.2,
    "value_clip": 0.2,
    "clip_predicted_values": False,
    "entropy_loss_scale": 0.0,
    "value_loss_scale": 0.5,
    "kl_threshold": 0,
    # running standardisation of observations and of value targets
    "state_preprocessor": RunningStandardScaler,
    "state_preprocessor_kwargs": {"size": env.observation_space, "device": device},
    "value_preprocessor": RunningStandardScaler,
    "value_preprocessor_kwargs": {"size": 1, "device": device},
})
# logging to TensorBoard and write checkpoints (in timesteps)
cfg["experiment"].update({
    "write_interval": 500,
    "checkpoint_interval": 5000,
    "directory": "runs/torch/Pendulum",
})
agent = PPO(models=models,
memory=memory,
cfg=cfg,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 100000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent)
# start training
trainer.train() The evaluation code is below. class Policy(GaussianMixin, Model):
def __init__(self, observation_space, action_space, device, clip_actions=False,
clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"):
Model.__init__(self, observation_space, action_space, device)
GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction)
self.net = nn.Sequential(nn.Linear(self.num_observations, 64),
nn.ReLU(),
nn.Linear(64, 64),
nn.ReLU(),
nn.Linear(64, self.num_actions))
self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))
def compute(self, inputs, role):
# Pendulum-v1 action_space is -2 to 2
return 2 * torch.tanh(self.net(inputs["states"])), self.log_std_parameter, {}
# load and wrap the gymnasium environment.
# note: the environment version may change depending on the gymnasium version
env = gym.make("Pendulum-v1", render_mode="human")
env = wrap_env(env, wrapper="gymnasium")
device = env.device
# # instantiate a memory as rollout buffer (any memory can be used for this)
# memory = RandomMemory(memory_size=1024, num_envs=env.num_envs, device=device)
# instantiate the agent's models (function approximators).
# PPO requires 2 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/api/agents/ppo.html#models
models = {}
models["policy"] = Policy(env.observation_space, env.action_space, device, clip_actions=True)
# models["value"] = Value(env.observation_space, env.action_space, device)
# configure and instantiate the agent (visit its documentation to see all the options)
# https://skrl.readthedocs.io/en/latest/api/agents/ppo.html#configuration-and-hyperparameters
cfg = PPO_DEFAULT_CONFIG.copy()
cfg["random_timesteps"] = 0
# The training script enabled a RunningStandardScaler on the observations, so
# the policy only ever saw normalized inputs. Declare the same preprocessor
# here so evaluation feeds the policy equally normalized observations.
# NOTE(review): skrl is expected to restore the scaler statistics from the
# checkpoint in `agent.load` when it is configured; confirm for your version.
cfg["state_preprocessor"] = RunningStandardScaler
cfg["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device}
cfg["experiment"]["checkpoint_interval"] = 0
agent = PPO(models=models,
memory=None,
cfg=cfg,
observation_space=env.observation_space,
action_space=env.action_space,
device=device)
# configure and instantiate the RL trainer
# A plain dict is used for the trainer configuration:
# SEQUENTIAL_TRAINER_DEFAULT_CONFIG is not imported by this script.
cfg_trainer = {"timesteps": 100000}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent)
# # loading the agent
agent.load("./runs/torch/Pendulum/24-07-16_15-10-26-913723_PPO/checkpoints/best_agent.pt")
# evaluate the agent
trainer.eval() |
Beta Was this translation helpful? Give feedback.
Replies: 1 comment 2 replies
-
Hi @umfundii In #87 (comment) you can find a similar implementation to that shown in the gymnasium documentation. Please note that whether or not an input preprocessor was used during training must be taken into account during manual stepping. |
Beta Was this translation helpful? Give feedback.
Hi @umfundii
In #87 (comment) you can find a similar implementation to that shown in the gymnasium documentation.
Please note that whether or not an input preprocessor was used during training must be taken into account during manual stepping.