From bb69932654f3f62009f0a11667b559b8e9d925f3 Mon Sep 17 00:00:00 2001
From: BDonnot
Date: Tue, 16 Jun 2020 11:23:31 +0200
Subject: [PATCH] adding a fix for issue #14

---
 l2rpn_baselines/test/test_import.py     | 13 ++--
 l2rpn_baselines/test/test_train_eval.py | 86 ++++++++++++++++------
 l2rpn_baselines/utils/DeepQAgent.py     | 98 ++++++++++++++-----------
 l2rpn_baselines/utils/TrainingParam.py  |  2 +-
 l2rpn_baselines/utils/__init__.py       |  2 +
 l2rpn_baselines/utils/make_multi_env.py | 49 +++++++++++++
 6 files changed, 180 insertions(+), 70 deletions(-)
 create mode 100644 l2rpn_baselines/utils/make_multi_env.py

diff --git a/l2rpn_baselines/test/test_import.py b/l2rpn_baselines/test/test_import.py
index 3d5ee1d..b022062 100644
--- a/l2rpn_baselines/test/test_import.py
+++ b/l2rpn_baselines/test/test_import.py
@@ -70,14 +70,15 @@ def load_module(self):
         return "PandapowerOPFAgent"
 
 
-class TestPandapowerGeirina(TestImport, unittest.TestCase):
-    def load_module(self):
-        return "Geirina"
+# because it deactivates the eager mode
+# class TestPandapowerGeirina(TestImport, unittest.TestCase):
+#     def load_module(self):
+#         return "Geirina"
 
 
-class TestAsynchronousActorCritic(TestImport, unittest.TestCase):
-    def load_module(self):
-        return "AsynchronousActorCritic"
+# class TestAsynchronousActorCritic(TestImport, unittest.TestCase):
+#     def load_module(self):
+#         return "AsynchronousActorCritic"
 
 
 if __name__ == "__main__":
diff --git a/l2rpn_baselines/test/test_train_eval.py b/l2rpn_baselines/test/test_train_eval.py
index eec3f6c..98e5ce0 100644
--- a/l2rpn_baselines/test/test_train_eval.py
+++ b/l2rpn_baselines/test/test_train_eval.py
@@ -12,8 +12,11 @@
 import warnings
 import tempfile
 
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
 import grid2op
-from l2rpn_baselines.utils import TrainingParam, NNParam
+from grid2op.Environment import MultiEnvironment
+
+from l2rpn_baselines.utils import TrainingParam, NNParam, make_multi_env
 from l2rpn_baselines.DeepQSimple import train as train_dqn
 from l2rpn_baselines.DeepQSimple import evaluate as eval_dqn
 from l2rpn_baselines.DuelQSimple import train as train_d3qs
@@ -32,8 +35,6 @@
 from l2rpn_baselines.SliceRDQN import evaluate as eval_srqn
 from l2rpn_baselines.SliceRDQN import SliceRDQN_Config as srdqn_cfg
 
-os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
-
 
 class TestDeepQSimple(unittest.TestCase):
     def test_train_eval(self):
@@ -41,13 +42,12 @@ def test_train_eval(self):
         tp.buffer_size = 100
         tp.minibatch_size = 8
         tp.update_freq = 32
+        tp.min_observation = 32
         tmp_dir = tempfile.mkdtemp()
         with warnings.catch_warnings():
             warnings.filterwarnings("ignore")
             env = grid2op.make("rte_case5_example", test=True)
-            li_attr_obs_X = ["day_of_week", "hour_of_day", "minute_of_hour", "prod_p", "prod_v", "load_p", "load_q",
-                             "actual_dispatch", "target_dispatch", "topo_vect", "time_before_cooldown_line",
-                             "time_before_cooldown_sub", "rho", "timestep_overflow", "line_status"]
+            li_attr_obs_X = ["prod_p", "load_p", "rho"]
 
             # neural network architecture
             observation_size = NNParam.get_obs_size(env, li_attr_obs_X)
@@ -85,6 +85,54 @@ def test_train_eval(self):
                                   verbose=False,
                                   save_gif=False)
 
+    def test_train_eval_multi(self):
+        tp = TrainingParam()
+        tp.buffer_size = 100
+        tp.minibatch_size = 8
+        tp.update_freq = 32
+        tp.min_observation = 32
+        tmp_dir = tempfile.mkdtemp()
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore")
+            env_init = grid2op.make("rte_case5_example", test=True)
+            env = make_multi_env(env_init, 2)
+
+            li_attr_obs_X = ["prod_p", "load_p", "rho"]
+
+            # neural network architecture
+            observation_size = NNParam.get_obs_size(env, li_attr_obs_X)
+            sizes = [100, 50, 10]  # sizes of each hidden layers
+            kwargs_archi = {'observation_size': observation_size,
+                            'sizes': sizes,
+                            'activs': ["relu" for _ in sizes],  # all relu activation function
+                            "list_attr_obs": li_attr_obs_X}
+
+            kwargs_converters = {"all_actions": None,
+                                 "set_line_status": False,
+                                 "change_bus_vect": True,
+                                 "set_topo_vect": False
+                                 }
+            nm_ = "AnneOnymous"
+            train_dqn(env,
+                      name=nm_,
+                      iterations=100,
+                      save_path=tmp_dir,
+                      load_path=None,
+                      logs_dir=tmp_dir,
+                      training_param=tp,
+                      verbose=False,
+                      kwargs_converters=kwargs_converters,
+                      kwargs_archi=kwargs_archi)
+
+            baseline_2 = eval_dqn(env_init,
+                                  name=nm_,
+                                  load_path=tmp_dir,
+                                  logs_path=tmp_dir,
+                                  nb_episode=1,
+                                  nb_process=1,
+                                  max_steps=30,
+                                  verbose=False,
+                                  save_gif=False)
 
 class TestDuelQSimple(unittest.TestCase):
     def test_train_eval(self):
@@ -92,13 +140,12 @@ def test_train_eval(self):
         tp.buffer_size = 100
         tp.minibatch_size = 8
         tp.update_freq = 32
+        tp.min_observation = 32
         tmp_dir = tempfile.mkdtemp()
         with warnings.catch_warnings():
             warnings.filterwarnings("ignore")
             env = grid2op.make("rte_case5_example", test=True)
-            li_attr_obs_X = ["day_of_week", "hour_of_day", "minute_of_hour", "prod_p", "prod_v", "load_p", "load_q",
-                             "actual_dispatch", "target_dispatch", "topo_vect", "time_before_cooldown_line",
-                             "time_before_cooldown_sub", "rho", "timestep_overflow", "line_status"]
+            li_attr_obs_X = ["prod_p", "load_p", "rho"]
 
             # neural network architecture
             observation_size = NNParam.get_obs_size(env, li_attr_obs_X)
@@ -143,13 +190,12 @@ def test_train_eval(self):
         tp.buffer_size = 100
         tp.minibatch_size = 8
         tp.update_freq = 32
+        tp.min_observation = 32
         tmp_dir = tempfile.mkdtemp()
         with warnings.catch_warnings():
             warnings.filterwarnings("ignore")
             env = grid2op.make("rte_case5_example", test=True)
-            li_attr_obs_X = ["day_of_week", "hour_of_day", "minute_of_hour", "prod_p", "prod_v", "load_p", "load_q",
-                             "actual_dispatch", "target_dispatch", "topo_vect", "time_before_cooldown_line",
-                             "time_before_cooldown_sub", "rho", "timestep_overflow", "line_status"]
+            li_attr_obs_X = ["prod_p", "load_p", "rho"]
 
             # neural network architecture
             observation_size = NNParam.get_obs_size(env, li_attr_obs_X)
@@ -201,20 +247,15 @@ def test_train_eval(self):
         tp.buffer_size = 100
         tp.minibatch_size = 8
         tp.update_freq = 32
+        tp.min_observation = 32
         tmp_dir = tempfile.mkdtemp()
         with warnings.catch_warnings():
             warnings.filterwarnings("ignore")
             env = grid2op.make("rte_case5_example", test=True)
-            li_attr_obs_X = ["day_of_week", "hour_of_day", "minute_of_hour", "prod_p", "prod_v", "load_p", "load_q",
-                             "actual_dispatch", "target_dispatch", "topo_vect", "time_before_cooldown_line",
-                             "time_before_cooldown_sub", "rho", "timestep_overflow", "line_status"]
 
-            # neural network architecture
-            li_attr_obs_X = ["day_of_week", "hour_of_day", "minute_of_hour", "prod_p", "prod_v", "load_p", "load_q",
-                             "actual_dispatch", "target_dispatch", "topo_vect", "time_before_cooldown_line",
-                             "time_before_cooldown_sub", "timestep_overflow", "line_status", "rho"]
-            li_attr_obs_Tau = ["rho", "line_status"]
-            sizes = [800, 800, 800, 494, 494, 494]
+            li_attr_obs_X = ["prod_p", "load_p", "rho"]
+            li_attr_obs_Tau = ["line_status"]
+            sizes = [100, 50, 10]
 
             x_dim = NNParam.get_obs_size(env, li_attr_obs_X)
             tau_dims = [NNParam.get_obs_size(env, [el]) for el in li_attr_obs_Tau]
@@ -257,6 +298,7 @@ def test_train_eval(self):
                        verbose=False,
                        save_gif=False)
 
+
 class TestD3QN(unittest.TestCase):
     def test_train_eval(self):
         tmp_dir = tempfile.mkdtemp()
@@ -294,6 +336,7 @@ def test_train_eval(self):
 
         assert eval_res is not None
 
+
 class TestRDQN(unittest.TestCase):
     def test_train_eval(self):
         tmp_dir = tempfile.mkdtemp()
@@ -329,6 +372,7 @@ def test_train_eval(self):
 
         assert eval_res is not None
 
+
 class TestSRDQN(unittest.TestCase):
     def test_train_eval(self):
         tmp_dir = tempfile.mkdtemp()
diff --git a/l2rpn_baselines/utils/DeepQAgent.py b/l2rpn_baselines/utils/DeepQAgent.py
index 3fc553a..535fb81 100644
--- a/l2rpn_baselines/utils/DeepQAgent.py
+++ b/l2rpn_baselines/utils/DeepQAgent.py
@@ -11,6 +11,7 @@
 from tqdm import tqdm
 import tensorflow as tf
 
+import grid2op
 from grid2op.Exceptions import Grid2OpException
 from grid2op.Agent import AgentWithConverter
 from grid2op.Converter import IdToAct
@@ -111,7 +112,6 @@ def __init__(self,
                  name="DeepQAgent",
                  store_action=True,
                  istraining=False,
-                 nb_env=1,
                  filter_action_fun=None,
                  verbose=False,
                  **kwargs_converters):
@@ -122,7 +122,7 @@ def __init__(self,
 
         # and now back to the origin implementation
         self.replay_buffer = None
-        self.__nb_env = nb_env
+        self.__nb_env = None
 
         self.deep_q = None
         self._training_param = None
@@ -306,8 +306,6 @@ def load(self, path):
                 conv_path = os.path.join(tmp_me, "{}.npy".format(nm_attr))
                 if os.path.exists(conv_path):
                     setattr(self, nm_attr, np.load(file=conv_path))
-                else:
-                    raise RuntimeError("Impossible to find the data \"{}.npy\" at \"{}\"".format(nm_attr, tmp_me))
 
     def save(self, path):
         """
@@ -336,7 +334,9 @@ def save(self, path):
             # TODO save the "oversampling" part, and all the other info
             for nm_attr in ["_time_step_lived", "_nb_chosen", "_proba"]:
                 conv_path = os.path.join(tmp_me, "{}.npy".format(nm_attr))
-                np.save(arr=getattr(self, nm_attr), file=conv_path)
+                attr_ = getattr(self, nm_attr)
+                if attr_ is not None:
+                    np.save(arr=attr_, file=conv_path)
 
     def train(self,
               env,
@@ -404,6 +404,14 @@ def train(self,
         UPDATE_FREQ = training_param.update_tensorboard_freq  # update tensorboard every "UPDATE_FREQ" steps
         SAVING_NUM = training_param.save_model_each
 
+        if isinstance(env, grid2op.Environment.Environment):
+            self.__nb_env = 1
+        else:
+            import warnings
+            nb_env = env.nb_env
+            warnings.warn("Training using {} environments".format(nb_env))
+            self.__nb_env = nb_env
+
         self.init_obs_extraction(env)
 
         training_step = self._training_param.last_step
@@ -435,22 +443,24 @@ def train(self,
 
         # for non uniform random sampling of the scenarios
         th_size = None
-        if _CACHE_AVAILABLE_DEEPQAGENT:
-            if isinstance(env.chronics_handler.real_data, MultifolderWithCache):
-                th_size = env.chronics_handler.real_data.cache_size
-        if th_size is None:
-            th_size = len(env.chronics_handler.real_data.subpaths)
-        self._prev_obs_num = 0
-        # number of time step lived per possible scenarios
-        if self._time_step_lived is None or self._time_step_lived.shape[0] != th_size:
-            self._time_step_lived = np.zeros(th_size, dtype=np.uint64)
-        # number of time a given scenario has been played
-        if self._nb_chosen is None or self._nb_chosen.shape[0] != th_size:
-            self._nb_chosen = np.zeros(th_size, dtype=np.uint)
-        # number of time a given scenario has been played
-        if self._proba is None or self._proba.shape[0] != th_size:
-            self._proba = np.ones(th_size, dtype=np.float64)
+        if self.__nb_env == 1:
+            # TODO make this available for multi env too
+            if _CACHE_AVAILABLE_DEEPQAGENT:
+                if isinstance(env.chronics_handler.real_data, MultifolderWithCache):
+                    th_size = env.chronics_handler.real_data.cache_size
+            if th_size is None:
+                th_size = len(env.chronics_handler.real_data.subpaths)
+
+            # number of time step lived per possible scenarios
+            if self._time_step_lived is None or self._time_step_lived.shape[0] != th_size:
+                self._time_step_lived = np.zeros(th_size, dtype=np.uint64)
+            # number of time a given scenario has been played
+            if self._nb_chosen is None or self._nb_chosen.shape[0] != th_size:
+                self._nb_chosen = np.zeros(th_size, dtype=np.uint)
+            # number of time a given scenario has been played
+            if self._proba is None or self._proba.shape[0] != th_size:
+                self._proba = np.ones(th_size, dtype=np.float64)
 
         self._prev_id = 0
 
         # this is for the "limit the episode length" depending on your previous success
@@ -485,6 +495,7 @@ def train(self,
                     temp_reward = np.array([temp_reward], dtype=np.float32)
                     temp_done = np.array([temp_done], dtype=np.bool)
                     info = [info]
+
                 new_state = self._convert_obs_train(temp_observation_obj)
                 self._updage_illegal_ambiguous(training_step, info)
                 done, reward, total_reward, alive_frame, epoch_num \
@@ -673,7 +684,8 @@ def _need_reset(self, env, observation_num, epoch_num, done, new_state):
 
             # update the number of time steps it has live
             ts_lived = observation_num - self._prev_obs_num
-            self._time_step_lived[self._prev_id] += ts_lived
+            if self._time_step_lived is not None:
+                self._time_step_lived[self._prev_id] += ts_lived
             self._prev_obs_num = observation_num
             if self._training_param.oversampling_rate is not None:
                 # proba = np.sqrt(1. / (self._time_step_lived +1))
@@ -694,7 +706,8 @@ def _need_reset(self, env, observation_num, epoch_num, done, new_state):
                 self._prev_id %= self._time_step_lived.shape[0]
 
             env.reset()
-            self._nb_chosen[self._prev_id] += 1
+            if self._nb_chosen is not None:
+                self._nb_chosen[self._prev_id] += 1
 
             # random fast forward between now and next week
             if self._training_param.random_sample_datetime_start is not None:
@@ -783,17 +796,17 @@ def _save_tensorboard(self, step, epoch_num, UPDATE_FREQ, epoch_rewards, epoch_a
         # print the top k scenarios the "hardest" (ie chosen the most number of times
         if self.verbose:
             top_k = 10
-            array_ = np.argsort(self._nb_chosen)[-top_k:][::-1]
-            print("hardest scenarios\n{}".format(array_))
-            print("They have been chosen respectively\n{}".format(self._nb_chosen[array_]))
-            # print("Associated proba are\n{}".format(self._proba[array_]))
-            print("The number of timesteps played is\n{}".format(self._time_step_lived[array_]))
-            print("avg (accross all scenarios) number of timsteps played {}"
-                  "".format(np.mean(self._time_step_lived)))
-            print("Time alive: {}".format(self._time_step_lived[array_] / (self._nb_chosen[array_] + 1)))
-            print("Avg time alive: {}".format(np.mean(self._time_step_lived / (self._nb_chosen + 1 ))))
-            # print("avg (accross all scenarios) proba {}"
-            #       "".format(np.mean(self._proba)))
+            if self._nb_chosen is not None:
+                array_ = np.argsort(self._nb_chosen)[-top_k:][::-1]
+                print("hardest scenarios\n{}".format(array_))
+                print("They have been chosen respectively\n{}".format(self._nb_chosen[array_]))
+                # print("Associated proba are\n{}".format(self._proba[array_]))
+                print("The number of timesteps played is\n{}".format(self._time_step_lived[array_]))
+                print("avg (accross all scenarios) number of timsteps played {}"
+                      "".format(np.mean(self._time_step_lived)))
+                print("Time alive: {}".format(self._time_step_lived[array_] / (self._nb_chosen[array_] + 1)))
+                print("Avg time alive: {}".format(np.mean(self._time_step_lived / (self._nb_chosen + 1 ))))
+
         with self._tf_writer.as_default():
             last_alive = epoch_alive[(epoch_num-1)]
             last_reward = epoch_rewards[(epoch_num-1)]
@@ -885,12 +898,13 @@ def _save_tensorboard(self, step, epoch_num, UPDATE_FREQ, epoch_rewards, epoch_a
 
             self.nb_do_nothing = 0
             self._nb_updated_act_tensorboard = 0
-
-            tf.summary.histogram(
-                "timestep_lived", self._time_step_lived, step=step_tb, buckets=None,
-                description="number of time steps lived for all scenarios"
-            )
-            tf.summary.histogram(
-                "nb_chosen", self._nb_chosen, step=step_tb, buckets=None,
-                description="number of times this scenarios has been played"
-            )
+            if self._time_step_lived is not None:
+                tf.summary.histogram(
+                    "timestep_lived", self._time_step_lived, step=step_tb, buckets=None,
+                    description="number of time steps lived for all scenarios"
+                )
+            if self._nb_chosen is not None:
+                tf.summary.histogram(
+                    "nb_chosen", self._nb_chosen, step=step_tb, buckets=None,
+                    description="number of times this scenarios has been played"
+                )
diff --git a/l2rpn_baselines/utils/TrainingParam.py b/l2rpn_baselines/utils/TrainingParam.py
index 8d9ff94..c186220 100644
--- a/l2rpn_baselines/utils/TrainingParam.py
+++ b/l2rpn_baselines/utils/TrainingParam.py
@@ -151,7 +151,7 @@ def __init__(self,
 
         self.buffer_size = buffer_size
         self.minibatch_size = minibatch_size
-        self.min_observation = min_observation  # 5000
+        self.min_observation = min_observation
         self._final_epsilon = float(final_epsilon)  # have on average 1 random action per day of approx 288 timesteps at the end (never kill completely the exploration)
         self._initial_epsilon = float(initial_epsilon)
         self.step_for_final_epsilon = float(step_for_final_epsilon)
diff --git a/l2rpn_baselines/utils/__init__.py b/l2rpn_baselines/utils/__init__.py
index c047252..747ebef 100644
--- a/l2rpn_baselines/utils/__init__.py
+++ b/l2rpn_baselines/utils/__init__.py
@@ -11,6 +11,7 @@
            "cli_train",
            "str2bool",
            "save_log_gif",
+           "make_multi_env",
            "zip_for_codalab",
            "train_generic",
            "TrainingParam",
@@ -26,6 +27,7 @@
 from l2rpn_baselines.utils.save_log_gif import save_log_gif
 from l2rpn_baselines.utils.zip_for_codalab import zip_for_codalab
 from l2rpn_baselines.utils.train_generic import train_generic
+from l2rpn_baselines.utils.make_multi_env import make_multi_env
 from l2rpn_baselines.utils.TrainingParam import TrainingParam
 from l2rpn_baselines.utils.NNParam import NNParam
diff --git a/l2rpn_baselines/utils/make_multi_env.py b/l2rpn_baselines/utils/make_multi_env.py
new file mode 100644
index 0000000..f157277
--- /dev/null
+++ b/l2rpn_baselines/utils/make_multi_env.py
@@ -0,0 +1,49 @@
+# Copyright (c) 2020, RTE (https://www.rte-france.com)
+# See AUTHORS.txt
+# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0.
+# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file,
+# you can obtain one at http://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions.
+
+import warnings
+from grid2op.Environment import MultiEnvironment, Environment
+
+
+def make_multi_env(env_init, nb_env):
+    """
+    This function creates a multi environment compatible with what is expected in the baselines. In particular, it
+    adds the observation_space, the action_space and the reward_range attribute.
+
+    The way this function works is explained in the getting_started of grid2op.
+
+    Attributes
+    -----------
+    env_init: :class:`grid2op.Environment.Environment`
+        The environment to duplicate
+    nb_env: ``int``
+        The number of environments with which you want to interact at the same time
+
+    Returns
+    -------
+    res: :class:`grid2op.Environment.MultiEnvironment` or :class:`grid2op.Environment.Environment`
+        A copy of the initial environment (if nb_env = 1) or a MultiEnvironment based on the initial environment
+        if nb_env >= 2.
+
+    """
+    res = None
+    nb_env = int(nb_env)
+
+    if nb_env <= 0:
+        raise RuntimeError("Impossible to create a negative number of environments")
+
+    if nb_env == 1:
+        warnings.warn("You asked to create 1 environment. We didn't use the MultiEnvironment for that. We instead "
+                      "created a copy of your initial environment.")
+        res = Environment(**env_init.get_kwargs())
+    else:
+        res = MultiEnvironment(nb_env, env_init)
+    res.observation_space = env_init.observation_space
+    res.action_space = env_init.action_space
+    res.reward_range = env_init.reward_range
+    return res
\ No newline at end of file
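
Usage sketch (illustrative only): the snippet below mirrors the new test_train_eval_multi test to show how make_multi_env is meant to be combined with a baseline's train and evaluate functions. It assumes the grid2op "rte_case5_example" test environment and the DeepQSimple baseline shipped with this repository; the agent name, the temporary directory and the small network sizes are placeholders, not values mandated by the patch.

import tempfile
import warnings

import grid2op

from l2rpn_baselines.utils import TrainingParam, NNParam, make_multi_env
from l2rpn_baselines.DeepQSimple import train as train_dqn
from l2rpn_baselines.DeepQSimple import evaluate as eval_dqn

tp = TrainingParam()
tp.buffer_size = 100
tp.minibatch_size = 8
tp.update_freq = 32
tp.min_observation = 32

tmp_dir = tempfile.mkdtemp()  # placeholder directory for the weights and the logs
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    env_init = grid2op.make("rte_case5_example", test=True)
    env = make_multi_env(env_init, 2)  # 2 copies of the environment run in parallel

    li_attr_obs_X = ["prod_p", "load_p", "rho"]
    kwargs_archi = {"observation_size": NNParam.get_obs_size(env, li_attr_obs_X),
                    "sizes": [100, 50, 10],
                    "activs": ["relu", "relu", "relu"],
                    "list_attr_obs": li_attr_obs_X}
    kwargs_converters = {"all_actions": None,
                         "set_line_status": False,
                         "change_bus_vect": True,
                         "set_topo_vect": False}

    # training interacts with the (multi) environment ...
    train_dqn(env,
              name="ExampleAgent",  # placeholder name
              iterations=100,
              save_path=tmp_dir,
              load_path=None,
              logs_dir=tmp_dir,
              training_param=tp,
              verbose=False,
              kwargs_converters=kwargs_converters,
              kwargs_archi=kwargs_archi)

    # ... while evaluation is run on the regular single environment
    eval_dqn(env_init,
             name="ExampleAgent",
             load_path=tmp_dir,
             logs_path=tmp_dir,
             nb_episode=1,
             nb_process=1,
             max_steps=30,
             verbose=False,
             save_gif=False)

Note that the parallel copies are only used during training; evaluation still expects a plain grid2op Environment, which is why env_init is passed to evaluate, exactly as the new test does.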