gymnasium rendering #173

umfundii · 2024-07-16T21:13:09Z

umfundii
Jul 16, 2024

Hey @Toni-SM, thanks for this fantastic library.
I have a few questions. I want to use gymnasium MuJoCo environments such as "InvertedPendulum-v4" to benchmark the performance of skrl. I used one of the example scripts for PPO to train and evaluate the policy. The documentation mentions that it is necessary to call the "gymnasium.make" function with 'render_mode="human"'. If I do so when I evaluate the policy, the evaluation becomes extremely slow. My naive question is: how do I render an already trained and evaluated policy in the gymnasium MuJoCo environments? Ideally, I want to do something like the example provided in the gymnasium documentation (shown below), where the action would come from the policy I have trained with skrl.

Example from gymnasium documentation.

import gymnasium as gym
# create the environment with on-screen rendering and get the first observation
env = gym.make("LunarLander-v2", render_mode="human")
observation, info = env.reset()

for _ in range(1000):
    # placeholder for the agent policy (it would use the observation and info):
    # here a random action is sampled from the action space
    action = env.action_space.sample()
    observation, reward, terminated, truncated, info = env.step(action)

    # start a new episode as soon as the current one ends for either reason
    episode_over = terminated or truncated
    if episode_over:
        observation, info = env.reset()

env.close()

I am using skrl version 1.2.0

import torch
import torch.nn as nn

# import the skrl components to build the RL system
from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG
from skrl.envs.wrappers.torch import wrap_env
from skrl.memories.torch import RandomMemory
from skrl.models.torch import DeterministicMixin, GaussianMixin, Model
from skrl.resources.preprocessors.torch import RunningStandardScaler
from skrl.resources.schedulers.torch import KLAdaptiveRL
from skrl.trainers.torch import SequentialTrainer
from skrl.utils import set_seed


# seed for reproducibility
set_seed()  # e.g. `set_seed(42)` for fixed seed


# define models (stochastic and deterministic models) using mixins
class Policy(GaussianMixin, Model):
    """Gaussian (stochastic) policy: a small MLP produces the mean actions and a
    learnable parameter holds the log standard deviation."""

    def __init__(self, observation_space, action_space, device, clip_actions=False,
                 clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"):
        # both bases are initialized explicitly, each with its own arguments
        Model.__init__(self, observation_space, action_space, device)
        GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction)

        hidden_units = 64
        layers = [
            nn.Linear(self.num_observations, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, self.num_actions),
        ]
        self.net = nn.Sequential(*layers)
        # one log standard deviation per action dimension, initialized to zero
        self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))

    def compute(self, inputs, role):
        """Return the mean actions, the log standard deviation parameter and an empty dict."""
        # Pendulum-v1 action_space is -2 to 2: tanh bounds the output to (-1, 1) before scaling
        mean_actions = 2 * torch.tanh(self.net(inputs["states"]))
        return mean_actions, self.log_std_parameter, {}

class Value(DeterministicMixin, Model):
    """Deterministic value model: a small MLP that maps the states to a single output."""

    def __init__(self, observation_space, action_space, device, clip_actions=False):
        # both bases are initialized explicitly, each with its own arguments
        Model.__init__(self, observation_space, action_space, device)
        DeterministicMixin.__init__(self, clip_actions)

        hidden_units = 64
        layers = [
            nn.Linear(self.num_observations, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, 1),
        ]
        self.net = nn.Sequential(*layers)

    def compute(self, inputs, role):
        """Return the network output for the given states and an empty dict."""
        state_values = self.net(inputs["states"])
        return state_values, {}


# load and wrap the gymnasium environment.
# note: the environment version may change depending on the gymnasium version
# build 4 synchronous Pendulum environments; if "Pendulum-v1" is rejected as deprecated
# or not found, fall back to the first registered id that starts with "Pendulum-v"
try:
    env = gym.vector.make("Pendulum-v1", num_envs=4, asynchronous=False)
except (gym.error.DeprecatedEnv, gym.error.VersionNotFound):
    registered_ids = [name for name in gym.envs.registry if name.startswith("Pendulum-v")]
    env_id = registered_ids[0]
    print("Pendulum-v1 not found. Trying {}".format(env_id))
    env = gym.vector.make(env_id, num_envs=4, asynchronous=False)
# wrap the environment so that it exposes the interface used by skrl
env = wrap_env(env, wrapper="gymnasium")

# device reported by the wrapped environment; reused for the memory, the models and the agent
device = env.device


# rollout buffer (any memory can be used for this); its size matches cfg["rollouts"] below
memory = RandomMemory(memory_size=1024, num_envs=env.num_envs, device=device)


# function approximators used by the agent.
# PPO requires 2 models, visit its documentation for more details
# https://skrl.readthedocs.io/en/latest/api/agents/ppo.html#models
models = {
    "policy": Policy(env.observation_space, env.action_space, device, clip_actions=True),
    "value": Value(env.observation_space, env.action_space, device),
}


# start from the default configuration and override the entries used by this experiment
# (visit the agent documentation to see all the options)
# https://skrl.readthedocs.io/en/latest/api/agents/ppo.html#configuration-and-hyperparameters
cfg = PPO_DEFAULT_CONFIG.copy()
cfg.update({
    "rollouts": 1024,  # memory_size
    "learning_epochs": 10,
    "mini_batches": 32,
    "discount_factor": 0.9,
    "lambda": 0.95,
    "learning_rate": 1e-3,
    "learning_rate_scheduler": KLAdaptiveRL,
    "learning_rate_scheduler_kwargs": {"kl_threshold": 0.008},
    "grad_norm_clip": 0.5,
    "ratio_clip": 0.2,
    "value_clip": 0.2,
    "clip_predicted_values": False,
    "entropy_loss_scale": 0.0,
    "value_loss_scale": 0.5,
    "kl_threshold": 0,
    "state_preprocessor": RunningStandardScaler,
    "state_preprocessor_kwargs": {"size": env.observation_space, "device": device},
    "value_preprocessor": RunningStandardScaler,
    "value_preprocessor_kwargs": {"size": 1, "device": device},
})
# logging to TensorBoard and write checkpoints (in timesteps)
cfg["experiment"].update({
    "write_interval": 500,
    "checkpoint_interval": 5000,
    "directory": "runs/torch/Pendulum",
})

agent = PPO(
    models=models,
    memory=memory,
    cfg=cfg,
    observation_space=env.observation_space,
    action_space=env.action_space,
    device=device,
)


# the trainer drives the agent/environment interaction for the configured number of timesteps
cfg_trainer = dict(timesteps=100000, headless=True)
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent)

# run the training loop
trainer.train()

The evaluation code is below.

class Policy(GaussianMixin, Model):
    """Gaussian (stochastic) policy: a small MLP produces the mean actions and a
    learnable parameter holds the log standard deviation."""

    def __init__(self, observation_space, action_space, device, clip_actions=False,
                 clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum"):
        # both bases are initialized explicitly, each with its own arguments
        Model.__init__(self, observation_space, action_space, device)
        GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction)

        hidden_units = 64
        layers = [
            nn.Linear(self.num_observations, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, self.num_actions),
        ]
        self.net = nn.Sequential(*layers)
        # one log standard deviation per action dimension, initialized to zero
        self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))

    def compute(self, inputs, role):
        """Return the mean actions, the log standard deviation parameter and an empty dict."""
        # Pendulum-v1 action_space is -2 to 2: tanh bounds the output to (-1, 1) before scaling
        mean_actions = 2 * torch.tanh(self.net(inputs["states"]))
        return mean_actions, self.log_std_parameter, {}



# load and wrap the gymnasium environment.
# note: the environment version may change depending on the gymnasium version
# a single (non-vectorized) environment is created with on-screen rendering enabled
env = gym.make("Pendulum-v1", render_mode="human")
env = wrap_env(env, wrapper="gymnasium")

device = env.device


# instantiate the agent's models (function approximators).
# only the policy is created here: no value model and no rollout memory are set up for evaluation
# https://skrl.readthedocs.io/en/latest/api/agents/ppo.html#models
models = {}
models["policy"] = Policy(env.observation_space, env.action_space, device, clip_actions=True)


# configure and instantiate the agent (visit its documentation to see all the options)
# https://skrl.readthedocs.io/en/latest/api/agents/ppo.html#configuration-and-hyperparameters
cfg = PPO_DEFAULT_CONFIG.copy()
cfg["random_timesteps"] = 0
# the policy was trained on observations passed through RunningStandardScaler, so the same
# state preprocessor has to be configured here; otherwise the policy is evaluated on raw,
# unscaled observations and the scaler stored in the checkpoint has nowhere to be loaded
cfg["state_preprocessor"] = RunningStandardScaler
cfg["state_preprocessor_kwargs"] = {"size": env.observation_space, "device": device}
# a checkpoint interval of 0 is used so that evaluation does not write checkpoints
cfg["experiment"]["checkpoint_interval"] = 0

agent = PPO(models=models,
            memory=None,
            cfg=cfg,
            observation_space=env.observation_space,
            action_space=env.action_space,
            device=device)


# configure and instantiate the RL trainer
# (the previous `SEQUENTIAL_TRAINER_DEFAULT_CONFIG.copy()` assignment was removed: the name was
# never imported and its value was immediately overwritten by this dictionary)
cfg_trainer = {"timesteps": 100000}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent)

# load the trained agent from its checkpoint
agent.load("./runs/torch/Pendulum/24-07-16_15-10-26-913723_PPO/checkpoints/best_agent.pt")
# evaluate the agent
trainer.eval()

Answered by Toni-SM

Jul 17, 2024

Hi @umfundii

In #87 (comment) you can find a similar implementation to that shown in the gymnasium documentation.

Please note that whether or not an input preprocessor was used during training must be taken into account when stepping the environment manually.

View full answer

Toni-SM · 2024-07-17T02:55:27Z

Toni-SM
Jul 17, 2024
Maintainer

Hi @umfundii

In #87 (comment) you can find a similar implementation to that shown in the gymnasium documentation.

Please note that whether or not an input preprocessor was used during training must be taken into account when stepping the environment manually.

2 replies

umfundii Jul 17, 2024
Author

@Toni-SM Thanks for the quick reply. Yes, #87 solved my problem (rendering issues).
Also, if possible, can you share your benchmark code for the gymnasium environments? I tried to locate it in the repo, but I was unsuccessful.
Thanks.

Toni-SM Aug 4, 2024
Maintainer

Hi @umfundii

Sorry for the late reply.
Benchmark results (only reward) are in #32, but I need to update them 🙈

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

gymnasium rendering #173

{{title}}

Replies: 1 comment 2 replies

{{title}}

{{title}}

{{title}}

Select a reply

gymnasium rendering #173

umfundii Jul 16, 2024

Replies: 1 comment · 2 replies

Toni-SM Jul 17, 2024 Maintainer

umfundii Jul 17, 2024 Author

Toni-SM Aug 4, 2024 Maintainer

umfundii
Jul 16, 2024

Replies: 1 comment 2 replies

Toni-SM
Jul 17, 2024
Maintainer

umfundii Jul 17, 2024
Author

Toni-SM Aug 4, 2024
Maintainer