diff --git a/algorithms/sb3/callbacks.py b/algorithms/sb3/callbacks.py
old mode 100644
new mode 100755
index fdc1ad13..46f4d91e
--- a/algorithms/sb3/callbacks.py
+++ b/algorithms/sb3/callbacks.py
@@ -178,7 +178,7 @@ def _create_and_log_video(
     policy = self.model
    base_env = self.locals["env"]._env
     action_tensor = torch.zeros(
-        (base_env.num_worlds, base_env.max_agent_count)
+        (base_env.num_worlds, base_env.max_agent_count, 3)  # todo: fix the dim
     )
 
     obs = base_env.reset()
diff --git a/algorithms/sb3/ppo/ippo.py b/algorithms/sb3/ppo/ippo.py
old mode 100644
new mode 100755
index cb06ca3c..323a8a84
--- a/algorithms/sb3/ppo/ippo.py
+++ b/algorithms/sb3/ppo/ippo.py
@@ -82,9 +82,9 @@ def collect_rollouts(
         while n_steps < n_rollout_steps:
             if (
-                self.use_sde
-                and self.sde_sample_freq > 0
-                and n_steps % self.sde_sample_freq == 0
+                self.use_sde
+                and self.sde_sample_freq > 0
+                and n_steps % self.sde_sample_freq == 0
             ):
                 # Sample a new noise matrix
                 self.policy.reset_noise(env.num_envs)
 
@@ -95,21 +95,21 @@ def collect_rollouts(
             # EDIT_1: Mask out invalid observations (NaN axes and/or dead agents)
             # Create dummy actions, values and log_probs (NaN)
             actions = torch.full(
-                fill_value=float("nan"), size=(self.n_envs,)
+                fill_value=float("nan"), size=(self.n_envs, 3)  # todo: should change based on action space
             ).to(self.device)
             log_probs = torch.full(
                 fill_value=float("nan"),
-                size=(self.n_envs,),
+                size=(self.n_envs, ),
                 dtype=torch.float32,
             ).to(self.device)
             values = (
                 torch.full(
                     fill_value=float("nan"),
-                    size=(self.n_envs,),
+                    size=(self.n_envs, ),
                     dtype=torch.float32,
                 )
-                .unsqueeze(dim=1)
-                .to(self.device)
+                .unsqueeze(dim=1)
+                .to(self.device)
             )
 
             # Get indices of alive agent ids
@@ -131,10 +131,9 @@ def collect_rollouts(
                     obs_tensor_alive
                 )
                 nn_fps = actions_tmp.shape[0] / (
-                    time.perf_counter() - time_actions
+                    time.perf_counter() - time_actions
                 )
                 self.logger.record("rollout/nn_fps", nn_fps)
-
             # Predict actions, vals and log_probs given obs
             (
                 actions[alive_agent_mask.squeeze(dim=1)],
diff --git a/algorithms/sb3/rollout_buffer.py b/algorithms/sb3/rollout_buffer.py
old mode 100644
new mode 100755
index 5d4b8101..8f8c903d
--- a/algorithms/sb3/rollout_buffer.py
+++ b/algorithms/sb3/rollout_buffer.py
@@ -176,6 +176,21 @@ def compute_returns_and_advantage(
             self.advantages
         ).any(), "Advantages arr contains NaN values: Check GAE computation"
 
+    # def swap_and_flatten(self, arr: np.ndarray) -> np.ndarray:
+    #     """
+    #     Swap and then flatten axes 0 (buffer_size) and 1 (n_envs)
+    #     to convert shape from [n_steps, n_envs, ...] (when ... is the shape of the features)
+    #     to [n_steps * n_envs, ...] (which maintain the order)
+    #
+    #     :param arr:
+    #     :return:
+    #     """
+    #     shape = arr.shape
+    #     print(shape)
+    #     if len(shape) < 3:
+    #         shape = (*shape, 1)
+    #     return arr.swapaxes(0, 1).reshape(shape[0] * shape[1], *shape[2:])
+
     def get(
         self, batch_size: Optional[int] = None
     ) -> Generator[RolloutBufferSamples, None, None]:
@@ -200,7 +215,7 @@ def get(
         # Flatten data
         # EDIT_5: And mask out invalid samples
         for tensor in _tensor_names:
-            if tensor == "observations":
+            if tensor in ["observations", "actions"]:
                 self.__dict__[tensor] = self.swap_and_flatten(
                     self.__dict__[tensor]
                 )[self.valid_samples_mask.flatten(), :]
@@ -212,7 +227,6 @@ def get(
             assert not torch.isnan(
                 self.__dict__[tensor]
             ).any(), f"{tensor} tensor contains NaN values; something went wrong"
-
         self.generator_ready = True
 
         # EDIT_6: Compute total number of samples and create indices
diff --git a/baselines/ippo/config.py b/baselines/ippo/config.py
old mode 100644
new mode 100755
index 1a79de41..f237c552
--- a/baselines/ippo/config.py
+++ b/baselines/ippo/config.py
@@ -10,11 +10,10 @@ class ExperimentConfig:
     """Configurations for experiments."""
 
     # DATASET
-    data_dir: str = "data/processed/examples"
+    data_dir: str = "/data/formatted_json_v2_no_tl_train/"  #todo: to be changed
 
     # NUM PARALLEL ENVIRONMENTS & DEVICE
-    num_worlds: int = 50  # Number of parallel environmentss
-
+    num_worlds: int = 1  # Number of parallel environments
     # How to select scenes from the dataset
     selection_discipline = SelectionDiscipline.K_UNIQUE_N  # K_UNIQUE_N / PAD_N
     k_unique_scenes: int = 3
@@ -31,7 +30,7 @@ class ExperimentConfig:
     render: bool = True
     render_mode: str = "rgb_array"
     render_freq: int = 50  # Render every k rollouts
-    render_n_worlds: int = 3  # Number of worlds to render
+    render_n_worlds: int = 1  # Number of worlds to render
 
     # TRACK THE TIME IT TAKES TO GET TO 95% GOAL RATE
     track_time_to_solve: bool = False
diff --git a/baselines/ippo/run_sb3_ppo.py b/baselines/ippo/run_sb3_ppo.py
old mode 100644
new mode 100755
index 84eea959..93a31beb
--- a/baselines/ippo/run_sb3_ppo.py
+++ b/baselines/ippo/run_sb3_ppo.py
@@ -34,8 +34,23 @@ def func(progress_remaining: float) -> float:
     return func
 
 
-def train(env_config: EnvConfig, exp_config: ExperimentConfig, scene_config: SceneConfig, action_type: str = "discrete"):
+def train(exp_config: ExperimentConfig, scene_config: SceneConfig, action_type: str = "discrete"):
     """Run PPO training with stable-baselines3."""
+
+    # CONFIG
+    env_config = EnvConfig(
+        dynamics_model="delta_local",
+        dx=torch.round(
+            torch.linspace(-6.0, 6.0, 20), decimals=3
+        ),
+        dy=torch.round(
+            torch.linspace(-6.0, 6.0, 20), decimals=3
+        ),
+        dyaw=torch.round(
+            torch.linspace(-np.pi, np.pi, 20), decimals=3
+        ),
+    )
+
     # MAKE SB3-COMPATIBLE ENVIRONMENT
     env = SB3MultiAgentEnv(
         config=env_config,
@@ -43,6 +58,7 @@ def train(env_config: EnvConfig, exp_config: ExperimentConfig, scene_config: Sce
         # Control up to all agents in the scene
         max_cont_agents=env_config.max_num_agents_in_scene,
         device=exp_config.device,
+        action_type=action_type
     )
 
     # SET MINIBATCH SIZE BASED ON ROLLOUT LENGTH
@@ -72,6 +88,7 @@ def train(env_config: EnvConfig, exp_config: ExperimentConfig, scene_config: Sce
     custom_callback = MultiAgentCallback(
         config=exp_config,
         wandb_run=run if run_id is not None else None,
+        # wandb_run=None,
     )
 
     # INITIALIZE IPPO
@@ -104,12 +121,11 @@ def train(env_config: EnvConfig, exp_config: ExperimentConfig, scene_config: Sce
         callback=custom_callback,
     )
 
-    run.finish()
+    # run.finish()
     env.close()
 
 
 if __name__ == "__main__":
-
     exp_config = pyrallis.parse(config_class=ExperimentConfig)
 
     env_config = EnvConfig(
@@ -132,4 +148,4 @@ def train(env_config: EnvConfig, exp_config: ExperimentConfig, scene_config: Sce
         k_unique_scenes=exp_config.k_unique_scenes,
     )
 
-    train(env_config, exp_config, scene_config, action_type="discrete")
+    train(exp_config, scene_config, action_type="multi_discrete")
diff --git a/pygpudrive/env/env_torch.py b/pygpudrive/env/env_torch.py
index 3c295bcc..db0906fd 100755
--- a/pygpudrive/env/env_torch.py
+++ b/pygpudrive/env/env_torch.py
@@ -37,7 +37,6 @@ def __init__(
         # Initialize simulator with parameters
         self.sim = self._initialize_simulator(params, scene_config)
-
         # Controlled agents setup
         self.cont_agent_mask = self.get_controlled_agents_mask()
         self.max_agent_count = self.cont_agent_mask.shape[1]
 
diff --git a/pygpudrive/env/wrappers/sb3_wrapper.py b/pygpudrive/env/wrappers/sb3_wrapper.py
index 31f5dbb3..f1975ece 100755
--- a/pygpudrive/env/wrappers/sb3_wrapper.py
+++ b/pygpudrive/env/wrappers/sb3_wrapper.py
@@ -31,6 +31,7 @@ def __init__(
         scene_config,
         max_cont_agents,
         device,
+        action_type,
         render_mode="rgb_array",
     ):
         kwargs={
@@ -47,7 +48,8 @@ def __init__(
         self.num_envs = self._env.cont_agent_mask.sum().item()
         self.device = device
         self.controlled_agent_mask = self._env.cont_agent_mask.clone()
-        self.action_space = gym.spaces.Discrete(self._env.action_space.n)
+        self.action_space = self._env.action_space
+        print(f'wrapper action space {self.action_space} {action_type}')
         self.observation_space = gym.spaces.Box(
             -np.inf, np.inf, self._env.observation_space.shape, np.float32
         )
@@ -57,9 +59,7 @@ def __init__(
         self.agent_step = torch.zeros(
             (self.num_worlds, self.max_agent_count)
         ).to(self.device)
-        self.actions_tensor = torch.zeros(
-            (self.num_worlds, self.max_agent_count)
-        ).to(self.device)
+        self._set_action_tensor(action_type, config.dynamics_model)
         # Storage: Fill buffer with nan values
         self.buf_rews = torch.full(
             (self.num_worlds, self.max_agent_count), fill_value=float("nan")
@@ -73,11 +73,20 @@ def __init__(
         ).to(self.device)
 
         self.num_episodes = 0
+        self.info_dict = {
+            "off_road": 0,
+            "veh_collisions": 0,
+            "non_veh_collision": 0,
+            "goal_achieved": 0,
+        }
 
     def _reset_seeds(self) -> None:
         """Reset all environments' seeds."""
         self._seeds = None
 
+    def _set_action_tensor(self, action_type, dynamics_model):
+        pass
+
     def reset(self, world_idx=None, seed=None):
         """Reset environment and return initial observations.
 
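
Note on the `_set_action_tensor` stub: as committed it is a bare `pass`, so `self.actions_tensor` (which the removed inline code used to allocate) is never created. Below is a minimal sketch of a possible body, assuming multi-discrete and continuous actions under the `delta_local` dynamics carry three components (dx, dy, dyaw); the component count is inferred from the `(..., 3)` todos in this patch, not confirmed by the repo:

    def _set_action_tensor(self, action_type, dynamics_model):
        """Allocate the per-step action buffer to match the action layout."""
        if action_type == "discrete":
            # One flat joint-action index per agent
            self.actions_tensor = torch.zeros(
                (self.num_worlds, self.max_agent_count)
            ).to(self.device)
        else:
            # One slot per action component, e.g. (dx, dy, dyaw) for delta_local
            self.actions_tensor = torch.zeros(
                (self.num_worlds, self.max_agent_count, 3)
            ).to(self.device)

The same shape rule would also resolve the `# todo: fix the dim` on `action_tensor` in callbacks.py.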
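Note on the hardcoded `size=(self.n_envs, 3)` in ippo.py: the todo asks for the placeholder-action shape to follow the action space instead. One way to derive it, sketched with Gym-style space types (the helper name `dummy_action_shape` is hypothetical, not part of the codebase):

    import numpy as np
    from gym import spaces

    def dummy_action_shape(action_space, n_envs):
        """Shape of the NaN-filled placeholder actions for a given action space."""
        if isinstance(action_space, spaces.Discrete):
            return (n_envs,)  # one flat index per agent
        if isinstance(action_space, spaces.MultiDiscrete):
            return (n_envs, len(action_space.nvec))  # one index per component
        if isinstance(action_space, spaces.Box):
            return (n_envs, int(np.prod(action_space.shape)))
        raise NotImplementedError(f"Unsupported action space: {action_space}")

The rollout code would then call `torch.full(fill_value=float("nan"), size=dummy_action_shape(self.action_space, self.n_envs))` in place of the hardcoded tuple.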
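Note on `self.action_space = self._env.action_space`: the wrapper now passes the underlying env's space through instead of forcing `Discrete`. With the `delta_local` config above, that space is presumably a `MultiDiscrete` over the dx/dy/dyaw bins; a sketch of that construction (whether env_torch.py builds it exactly this way is an assumption):

    import numpy as np
    import torch
    import gym

    # 20 evenly spaced bins per component, mirroring the EnvConfig in train()
    dx = torch.round(torch.linspace(-6.0, 6.0, 20), decimals=3)
    dy = torch.round(torch.linspace(-6.0, 6.0, 20), decimals=3)
    dyaw = torch.round(torch.linspace(-np.pi, np.pi, 20), decimals=3)

    # One discrete choice per component; an action is a triple of bin indices
    action_space = gym.spaces.MultiDiscrete([len(dx), len(dy), len(dyaw)])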
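Note on the new `info_dict`: this hunk only initializes the counters; no accumulation site is shown in the patch. Presumably `step()` increments them from per-agent info tensors; a short sketch under that assumption (the `info[key]` tensors and their names are hypothetical beyond the keys listed above):

    # Inside step(), after the simulator transition
    for key in ("off_road", "veh_collisions", "non_veh_collision", "goal_achieved"):
        self.info_dict[key] += info[key][self.controlled_agent_mask].sum().item()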