Robust PLR #29

Open · wants to merge 30 commits into base: main

30 commits
2b7e080
minor update to accept _venv
AmeenUrRehman Mar 31, 2024
d098b40
updated the task_sampler to accept evaluation task, robust_plr and en…
AmeenUrRehman Apr 7, 2024
3b3cea1
revert the changes for seed function
AmeenUrRehman Apr 7, 2024
39aa1bc
fixes the issues and update the functions
AmeenUrRehman Apr 17, 2024
992bdab
update the plr_wrapper to accept the robust_plr option
AmeenUrRehman Apr 17, 2024
d92f64c
clean-up code and initialize the action_value_fn and callables
AmeenUrRehman Apr 18, 2024
d24434b
optimize the sample_fn and update to accept the action_value_fn
AmeenUrRehman Apr 18, 2024
03b6b77
add the robust_plr option for existing plr
AmeenUrRehman Apr 21, 2024
cda5ddb
clean-up code for plr_wrapper and fixes minor issues
AmeenUrRehman Apr 21, 2024
908688d
optimize code based on the suggestions
AmeenUrRehman Apr 21, 2024
8364ff9
add user defined get_action_value func
AmeenUrRehman Apr 21, 2024
cb71cb8
fixes the eval_envs parameters
AmeenUrRehman Apr 21, 2024
75697e5
updating the task_sampler to fix issues
AmeenUrRehman Apr 22, 2024
a6e2b31
fix errors and optimize codes
AmeenUrRehman Apr 24, 2024
c6b7ade
added other parameters for Task Sampler initialization
AmeenUrRehman Apr 25, 2024
16fd2dc
initialisation for get_value_fn
AmeenUrRehman Apr 25, 2024
91d7060
added rollout class for task_sampler and fixed the GAE.
AmeenUrRehman Apr 25, 2024
e40c3ad
updated the task_Sampler to fix errors.
AmeenUrRehman Apr 26, 2024
4a1d9dc
Fix Robust PLR
RyanNavillus Apr 28, 2024
adbf1c5
Merge branch 'main' of https://github.com/RyanNavillus/Syllabus into …
RyanNavillus Apr 28, 2024
2ed4865
Merge branch 'main' of https://github.com/RyanNavillus/Syllabus into …
RyanNavillus Apr 28, 2024
b3afe49
Merge branch 'main' of https://github.com/RyanNavillus/Syllabus into …
RyanNavillus Apr 28, 2024
85c7504
Fix procgen script for robust plr
RyanNavillus Apr 28, 2024
2b281e6
pulled everything and continue on robust_plr
AmeenUrRehman Apr 30, 2024
1cc51c2
Add Storage file
RyanNavillus May 2, 2024
be3753d
Merge branch 'robust_plr' of github.com:AmeenUrRehman/Syllabus into r…
RyanNavillus May 2, 2024
81fa16e
updated the curricula to accept robust_plr
AmeenUrRehman May 3, 2024
2ea5979
fixes errors, minor changes and updated the task_sampler
AmeenUrRehman May 4, 2024
31c63fe
minor cleanup
AmeenUrRehman May 8, 2024
b96686d
minor cleanup final
AmeenUrRehman May 9, 2024
2 changes: 1 addition & 1 deletion syllabus/curricula/plr/central_plr_wrapper.py
@@ -133,7 +133,7 @@ def __init__(
self._gae_lambda = gae_lambda
self._supress_usage_warnings = suppress_usage_warnings
self._task2index = {task: i for i, task in enumerate(self.tasks)}
self._task_sampler = TaskSampler(self.tasks, action_space=action_space, **task_sampler_kwargs_dict)
self._task_sampler = TaskSampler(self.tasks, task_space=task_space, action_space=action_space, **task_sampler_kwargs_dict)
self._rollouts = RolloutStorage(
self._num_steps,
self._num_processes,
15 changes: 14 additions & 1 deletion syllabus/curricula/plr/plr_wrapper.py
@@ -149,6 +149,9 @@ class PrioritizedLevelReplay(Curriculum):
gamma (float): The discount factor used to compute returns
gae_lambda (float): The GAE lambda value.
suppress_usage_warnings (bool): Whether to suppress warnings about improper usage.
robust_plr (bool): Option to use RobustPLR.
eval_envs: Evaluation environments for RobustPLR.
action_value_fn (callable): A function that takes an observation as input and returns an action and value.
**curriculum_kwargs: Keyword arguments to pass to the curriculum.
"""
REQUIRES_STEP_UPDATES = True
@@ -170,6 +173,9 @@ def __init__(
suppress_usage_warnings=False,
get_value=null,
get_action_log_dist=null,
robust_plr: bool = False, # Option to use RobustPLR
eval_envs=None,
action_value_fn=None,
**curriculum_kwargs,
):
# Preprocess curriculum initialization args
@@ -186,15 +192,22 @@ def __init__(
task_sampler_kwargs_dict["num_actors"] = num_processes
super().__init__(task_space, *curriculum_args, **curriculum_kwargs)

if robust_plr and eval_envs is None:
raise UsageError("RobustPLR requires evaluation environments to be provided.")

self._num_steps = num_steps # Number of steps stored in rollouts and used to update task sampler
self._num_processes = num_processes # Number of parallel environments
self._gamma = gamma
self._gae_lambda = gae_lambda
self._supress_usage_warnings = suppress_usage_warnings
self._get_action_log_dist = get_action_log_dist
self._task2index = {task: i for i, task in enumerate(self.tasks)}
self._robust_plr = robust_plr
self._eval_envs = eval_envs
self.action_value_fn = action_value_fn

self._task_sampler = TaskSampler(self.tasks, task_space=task_space, action_space=action_space, robust_plr=robust_plr, eval_envs=eval_envs, action_value_fn=action_value_fn, **task_sampler_kwargs_dict)

self._task_sampler = TaskSampler(self.tasks, action_space=action_space, **task_sampler_kwargs_dict)
self._rollouts = RolloutStorage(
self._num_steps,
self._num_processes,
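For orientation, here is a minimal sketch of how the options added in this file might be wired up from training code. The full constructor signature is only partially visible in this diff; `SimplePolicy`, `make_eval_env`, `task_space`, and the import paths are illustrative assumptions, and only the `robust_plr`, `eval_envs`, and `action_value_fn` arguments come from the changes above.

```python
# Hedged sketch, not part of this PR: enabling RobustPLR with a user-supplied
# action/value callable. Placeholder names are marked as such.
import gymnasium as gym

from syllabus.curricula import PrioritizedLevelReplay  # assumed export path


class SimplePolicy:
    """Hypothetical agent handle that can score an observation."""

    def predict(self, obs):
        action, value = 0, 0.0  # placeholder action and value estimate
        return action, value


policy = SimplePolicy()


def action_value_fn(obs):
    # Matches the docstring above: takes an observation, returns (action, value).
    return policy.predict(obs)


def make_eval_env():
    return gym.make("CartPole-v1")  # placeholder evaluation environment


eval_envs = [make_eval_env() for _ in range(2)]

curriculum = PrioritizedLevelReplay(
    task_space,                       # assumed: a Syllabus TaskSpace for this environment
    action_space=eval_envs[0].action_space,
    robust_plr=True,                  # turn on RobustPLR
    eval_envs=eval_envs,              # required when robust_plr=True, per the UsageError above
    action_value_fn=action_value_fn,  # how the sampler gets actions/values during evaluation
)
```

Passing a plain callable keeps the curriculum decoupled from any particular RL library: the sampler only needs (action, value) pairs for its evaluation rollouts, however they are produced.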
69 changes: 69 additions & 0 deletions syllabus/curricula/plr/storage.py
@@ -0,0 +1,69 @@
import gymnasium as gym
import torch

class RolloutStorage(object):
def __init__(
self,
num_steps: int,
num_processes: int,
requires_value_buffers: bool,
action_space: gym.Space = None,
):
self._requires_value_buffers = requires_value_buffers
self.tasks = torch.zeros(num_steps, num_processes, 1, dtype=torch.int)
self.masks = torch.ones(num_steps + 1, num_processes, 1)

if requires_value_buffers:
self.returns = torch.zeros(num_steps + 1, num_processes, 1)
self.rewards = torch.zeros(num_steps, num_processes, 1)
self.value_preds = torch.zeros(num_steps + 1, num_processes, 1)
else:
if action_space is None:
raise ValueError(
"Action space must be provided to PLR for strategies 'policy_entropy', 'least_confidence', 'min_margin'"
)
self.action_log_dist = torch.zeros(num_steps, num_processes, action_space.n)

self.num_steps = num_steps
self.step = 0

def to(self, device):
self.masks = self.masks.to(device)
self.tasks = self.tasks.to(device)
if self._requires_value_buffers:
self.rewards = self.rewards.to(device)
self.value_preds = self.value_preds.to(device)
self.returns = self.returns.to(device)
else:
self.action_log_dist = self.action_log_dist.to(device)

def insert(self, masks, action_log_dist=None, value_preds=None, rewards=None, tasks=None):
if self._requires_value_buffers:
assert (value_preds is not None and rewards is not None), "Selected strategy requires value_preds and rewards"
if len(rewards.shape) == 3:
rewards = rewards.squeeze(2)
self.value_preds[self.step].copy_(torch.as_tensor(value_preds))
self.rewards[self.step].copy_(torch.as_tensor(rewards))
self.masks[self.step + 1].copy_(torch.as_tensor(masks))
else:
self.action_log_dist[self.step].copy_(action_log_dist)
if tasks is not None:
# assert isinstance(tasks[0], (int, torch.int32)), "Provided task must be an integer"
self.tasks[self.step].copy_(torch.as_tensor(tasks))
self.step = (self.step + 1) % self.num_steps

def after_update(self):
self.masks[0].copy_(self.masks[-1])

def compute_returns(self, next_value, gamma, gae_lambda):
assert self._requires_value_buffers, "Selected strategy does not use compute_returns."
self.value_preds[-1] = next_value
gae = 0
for step in reversed(range(self.rewards.size(0))):
delta = (
self.rewards[step]
+ gamma * self.value_preds[step + 1] * self.masks[step + 1]
- self.value_preds[step]
)
gae = delta + gamma * gae_lambda * self.masks[step + 1] * gae
self.returns[step] = gae + self.value_preds[step]
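To show how the new storage is meant to be driven, here is a short usage sketch of the value-buffer path; the zero tensors stand in for real rollout data, and the import path is assumed from the file location above.

```python
# Hedged sketch: exercising RolloutStorage with value buffers enabled.
import torch

from syllabus.curricula.plr.storage import RolloutStorage  # path assumed from this diff

num_steps, num_processes = 4, 2
storage = RolloutStorage(num_steps, num_processes, requires_value_buffers=True)

for _ in range(num_steps):
    storage.insert(
        masks=torch.ones(num_processes, 1),         # 1.0 = episode still running
        value_preds=torch.zeros(num_processes, 1),  # placeholder value estimates
        rewards=torch.zeros(num_processes, 1),      # placeholder rewards
        tasks=torch.zeros(num_processes, 1, dtype=torch.int),
    )

# Bootstrap with the value of the state after the last step, then compute
# GAE(gamma, lambda) returns as implemented in compute_returns above.
next_value = torch.zeros(num_processes, 1)
storage.compute_returns(next_value, gamma=0.99, gae_lambda=0.95)
print(storage.returns.shape)  # torch.Size([5, 2, 1])
```

With `requires_value_buffers=False`, `action_space` must be supplied instead and `insert` expects `action_log_dist` rather than `value_preds`/`rewards`, matching the branches above.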