diff --git a/.gitmodules b/.gitmodules index 4e37359..6ade8ce 100644 --- a/.gitmodules +++ b/.gitmodules @@ -6,4 +6,4 @@ url = https://github.com/djmax008/GEIRINA_baseline [submodule "l2rpn_baselines/AsynchronousActorCritic"] path = l2rpn_baselines/AsynchronousActorCritic - url = https://github.com/KishanGitASU/A3C-RL-baseline-agent-for-Grid2Op-environment.git + url = https://github.com/KishanGitASU/A3C-RL-baseline-agent-for-Grid2Op-environment.git \ No newline at end of file diff --git a/docs/DoubleDuelingDQN.rst b/docs/DoubleDuelingDQN.rst new file mode 100644 index 0000000..df13a60 --- /dev/null +++ b/docs/DoubleDuelingDQN.rst @@ -0,0 +1,56 @@ +DoubleDuelingDQN: An example implementation of Double Duelling Deep Q Network +============================================================================= + +Description +----------- +This module serves as a concrete example of how to implement a D3QN baseline. +This baseline is of type Double Duelling Deep Q Network, as in Duelling Q Network and DoubleQ update. + +Its main purpose is to provide an example of this network type running with Grid2Op. However, don't expect to obtain state-of-the-art results. + + +Agent class +------------------------ +You can use this class with: + +.. code-block:: python + + from l2rpn_baselines.DoubleDuelingDQN import DoubleDuelingDQN + from l2rpn_baselines.DoubleDuelingDQN import train + from l2rpn_baselines.DoubleDuelingDQN import evaluate + +.. automodule:: l2rpn_baselines.DoubleDuelingDQN.DoubleDuelingDQN + :members: + :autosummary: + +Configuration +------------------------ +Training a model requires tweaking many hyperparameters; these are exposed as class attributes of a dedicated configuration class: + +.. code-block:: python + + from l2rpn_baselines.DoubleDuelingDQN import DoubleDuelingDQNConfig + + # Set hyperparameters before training + DoubleDuelingDQNConfig.LR = 1e-5 + DoubleDuelingDQNConfig.INITIAL_EPSILON = 1.0 + DoubleDuelingDQNConfig.FINAL_EPSILON = 0.001 + DoubleDuelingDQNConfig.DECAY_EPSILON = 10000 + +.. automodule:: l2rpn_baselines.DoubleDuelingDQN.DoubleDuelingDQNConfig + :members: + :undoc-members: + +Internal classes +------------------------ +The neural network model is defined in a separate class. +You may want to import it manually: + +.. code-block:: python + + from l2rpn_baselines.DoubleDuelingDQN.DoubleDuelingDQN_NN import DoubleDuelingDQN_NN + + +.. autoclass:: l2rpn_baselines.DoubleDuelingDQN.DoubleDuelingDQN_NN.DoubleDuelingDQN_NN + :members: + :autosummary: diff --git a/docs/DoubleDuelingRDQN.rst b/docs/DoubleDuelingRDQN.rst new file mode 100644 index 0000000..c286e1f --- /dev/null +++ b/docs/DoubleDuelingRDQN.rst @@ -0,0 +1,54 @@ +DoubleDuelingRDQN: An example implementation of Recurrent DoubleQ Network +========================================================================= + +Description +----------- +This module serves as a concrete example of how to implement a recurrent D3QN baseline. +This baseline is of type Recurrent Double Duelling Deep Q Network, as in Duelling Q, DoubleQ update and recurrent neural network. + +Its main purpose is to provide an example of this network type running with Grid2Op. However, don't expect to obtain state-of-the-art results. + + +Agent class +------------------------ +You can use this class with: + +.. code-block:: python + + from l2rpn_baselines.DoubleDuelingRDQN import DoubleDuelingRDQN + from l2rpn_baselines.DoubleDuelingRDQN import train + from l2rpn_baselines.DoubleDuelingRDQN import evaluate + +.. 
automodule:: l2rpn_baselines.DoubleDuelingRDQN.DoubleDuelingRDQN + :members: + :autosummary: + +Configuration +------------------------ +Training a model requires tweaking many hyperparameters; these are exposed as class attributes of a dedicated configuration class: + +.. code-block:: python + + from l2rpn_baselines.DoubleDuelingRDQN import DoubleDuelingRDQNConfig + + # Set hyperparameters before training + DoubleDuelingRDQNConfig.LR = 1e-5 + DoubleDuelingRDQNConfig.TRACE_LENGTH = 12 + +.. automodule:: l2rpn_baselines.DoubleDuelingRDQN.DoubleDuelingRDQNConfig + :members: + :undoc-members: + +Internal classes +------------------------ +The neural network model is defined in a separate class. +You may want to import it manually: + +.. code-block:: python + + from l2rpn_baselines.DoubleDuelingRDQN.DoubleDuelingRDQN_NN import DoubleDuelingRDQN_NN + + +.. autoclass:: l2rpn_baselines.DoubleDuelingRDQN.DoubleDuelingRDQN_NN.DoubleDuelingRDQN_NN + :members: + :autosummary: diff --git a/docs/index.rst b/docs/index.rst index ed44a1e..6492796 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -26,9 +26,11 @@ Baseline already Available utils DeepQSimple + DoubleDuelingDQN DuelQSimple SAC + More advanced baselines ------------------------ @@ -36,6 +38,7 @@ More advanced baselines :maxdepth: 2 DuelQLeapNet + DoubleDuelingRDQN Contributions diff --git a/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN.py b/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN.py index 59482df..316649c 100644 --- a/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN.py +++ b/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN.py @@ -14,43 +14,32 @@ from grid2op.Agent import AgentWithConverter from grid2op.Converter import IdToAct +from l2rpn_baselines.DoubleDuelingDQN.DoubleDuelingDQNConfig import DoubleDuelingDQNConfig as cfg from l2rpn_baselines.DoubleDuelingDQN.DoubleDuelingDQN_NN import DoubleDuelingDQN_NN from l2rpn_baselines.DoubleDuelingDQN.prioritized_replay_buffer import PrioritizedReplayBuffer -LR_DECAY_STEPS = 1024*32 -LR_DECAY_RATE = 0.95 -INITIAL_EPSILON = 0.99 -FINAL_EPSILON = 0.001 -DECAY_EPSILON = 1024*32 -DISCOUNT_FACTOR = 0.99 -PER_CAPACITY = 1024*64 -PER_ALPHA = 0.7 -PER_BETA = 0.5 -UPDATE_FREQ = 64 -UPDATE_TARGET_HARD_FREQ = 16 -UPDATE_TARGET_SOFT_TAU = -1 - - class DoubleDuelingDQN(AgentWithConverter): def __init__(self, observation_space, action_space, name=__name__, - num_frames=4, - is_training=False, - batch_size=32, - lr=1e-5): + is_training=False): # Call parent constructor AgentWithConverter.__init__(self, action_space, action_space_converter=IdToAct) self.obs_space = observation_space + + # Filter + #print("Actions filtering...") + self.action_space.filter_action(self._filter_action) + #print("..Done") # Store constructor params self.name = name - self.num_frames = num_frames + self.num_frames = cfg.N_FRAMES self.is_training = is_training - self.batch_size = batch_size - self.lr = lr + self.batch_size = cfg.BATCH_SIZE + self.lr = cfg.LR # Declare required vars self.Qmain = None @@ -76,18 +65,34 @@ def __init__(self, self.observation_size, num_frames=self.num_frames, learning_rate=self.lr, - learning_rate_decay_steps=LR_DECAY_STEPS, - learning_rate_decay_rate=LR_DECAY_RATE) + learning_rate_decay_steps=cfg.LR_DECAY_STEPS, + learning_rate_decay_rate=cfg.LR_DECAY_RATE) # Setup training vars if needed if self.is_training: self._init_training() + def _filter_action(self, action): + MAX_ELEM = 2 + act_dict = action.impact_on_objects() + elem = 0 + elem += act_dict["force_line"]["reconnections"]["count"] + elem += 
act_dict["force_line"]["disconnections"]["count"] + elem += act_dict["switch_line"]["count"] + elem += len(act_dict["topology"]["bus_switch"]) + elem += len(act_dict["topology"]["assigned_bus"]) + elem += len(act_dict["topology"]["disconnect_bus"]) + elem += len(act_dict["redispatch"]["generators"]) + + if elem <= MAX_ELEM: + return True + return False + def _init_training(self): - self.epsilon = INITIAL_EPSILON + self.epsilon = cfg.INITIAL_EPSILON self.frames2 = [] self.epoch_rewards = [] self.epoch_alive = [] - self.per_buffer = PrioritizedReplayBuffer(PER_CAPACITY, PER_ALPHA) + self.per_buffer = PrioritizedReplayBuffer(cfg.PER_CAPACITY, cfg.PER_ALPHA) self.Qtarget = DoubleDuelingDQN_NN(self.action_size, self.observation_size, num_frames = self.num_frames) @@ -115,32 +120,32 @@ def _save_next_frame(self, next_state): self.frames2.pop(0) def _adaptive_epsilon_decay(self, step): - ada_div = DECAY_EPSILON / 10.0 + ada_div = cfg.DECAY_EPSILON / 10.0 step_off = step + ada_div - ada_eps = INITIAL_EPSILON * -math.log10((step_off + 1) / (DECAY_EPSILON + ada_div)) - ada_eps_up_clip = min(INITIAL_EPSILON, ada_eps) - ada_eps_low_clip = max(FINAL_EPSILON, ada_eps_up_clip) + ada_eps = cfg.INITIAL_EPSILON * -math.log10((step_off + 1) / (cfg.DECAY_EPSILON + ada_div)) + ada_eps_up_clip = min(cfg.INITIAL_EPSILON, ada_eps) + ada_eps_low_clip = max(cfg.FINAL_EPSILON, ada_eps_up_clip) return ada_eps_low_clip def _save_hyperparameters(self, logpath, env, steps): r_instance = env.reward_helper.template_reward hp = { - "lr": self.lr, - "lr_decay_steps": LR_DECAY_STEPS, - "lr_decay_rate": LR_DECAY_RATE, - "batch_size": self.batch_size, - "stack_frames": self.num_frames, + "lr": cfg.LR, + "lr_decay_steps": cfg.LR_DECAY_STEPS, + "lr_decay_rate": cfg.LR_DECAY_RATE, + "batch_size": cfg.BATCH_SIZE, + "stack_frames": cfg.N_FRAMES, "iter": steps, - "e_start": INITIAL_EPSILON, - "e_end": FINAL_EPSILON, - "e_decay": DECAY_EPSILON, - "discount": DISCOUNT_FACTOR, - "per_alpha": PER_ALPHA, - "per_beta": PER_BETA, - "per_capacity": PER_CAPACITY, - "update_freq": UPDATE_FREQ, - "update_hard": UPDATE_TARGET_HARD_FREQ, - "update_soft": UPDATE_TARGET_SOFT_TAU, + "e_start": cfg.INITIAL_EPSILON, + "e_end": cfg.FINAL_EPSILON, + "e_decay": cfg.DECAY_EPSILON, + "discount": cfg.DISCOUNT_FACTOR, + "per_alpha": cfg.PER_ALPHA, + "per_beta": cfg.PER_BETA, + "per_capacity": cfg.PER_CAPACITY, + "update_freq": cfg.UPDATE_FREQ, + "update_hard": cfg.UPDATE_TARGET_HARD_FREQ, + "update_soft": cfg.UPDATE_TARGET_SOFT_TAU, "reward": dict(r_instance) } hp_filename = "{}-hypers.json".format(self.name) @@ -202,7 +207,7 @@ def train(self, env, num_training_steps = iterations num_steps = num_pre_training_steps + num_training_steps step = 0 - self.epsilon = INITIAL_EPSILON + self.epsilon = cfg.INITIAL_EPSILON alive_steps = 0 total_reward = 0 self.done = True @@ -213,19 +218,14 @@ def train(self, env, modelpath = os.path.join(save_path, self.name + ".h5") self.tf_writer = tf.summary.create_file_writer(logpath, name=self.name) self._save_hyperparameters(save_path, env, num_steps) - + # Training loop while step < num_steps: # Init first time or new episode if self.done: new_obs = env.reset() # This shouldn't raise - # Random fast forward somewhere in the day - #ff_rand = np.random.randint(0, 12*24) - #env.fast_forward_chronics(ff_rand) - # Reset internal state - #new_obs = env.current_obs self.reset(new_obs) - if step % 1000 == 0: + if cfg.VERBOSE and step % 1000 == 0: print("Step [{}] -- Random [{}]".format(step, self.epsilon)) # Save current observation 
to stacking buffer @@ -248,7 +248,8 @@ def train(self, env, new_state = self.convert_obs(new_obs) if info["is_illegal"] or info["is_ambiguous"] or \ info["is_dispatching_illegal"] or info["is_illegal_reco"]: - print (a, info) + if cfg.VERBOSE: + print (a, info) # Save new observation to stacking buffer self._save_next_frame(new_state) @@ -267,24 +268,28 @@ def train(self, env, self.epsilon = self._adaptive_epsilon_decay(training_step) # Perform training at given frequency - if step % UPDATE_FREQ == 0 and len(self.per_buffer) >= self.batch_size: + if step % cfg.UPDATE_FREQ == 0 and \ + len(self.per_buffer) >= self.batch_size: # Perform training self._batch_train(training_step, step) - if UPDATE_TARGET_SOFT_TAU > 0.0: + if cfg.UPDATE_TARGET_SOFT_TAU > 0.0: + tau = cfg.UPDATE_TARGET_SOFT_TAU # Update target network towards primary network - self.Qmain.update_target_soft(self.Qtarget.model, tau=UPDATE_TARGET_SOFT_TAU) + self.Qmain.update_target_soft(self.Qtarget.model, tau) # Every UPDATE_TARGET_HARD_FREQ trainings, update target completely - if UPDATE_TARGET_HARD_FREQ > 0 and step % (UPDATE_FREQ * UPDATE_TARGET_HARD_FREQ) == 0: + if cfg.UPDATE_TARGET_HARD_FREQ > 0 and \ + step % (cfg.UPDATE_FREQ * cfg.UPDATE_TARGET_HARD_FREQ) == 0: self.Qmain.update_target_hard(self.Qtarget.model) total_reward += reward if self.done: self.epoch_rewards.append(total_reward) self.epoch_alive.append(alive_steps) - print("Survived [{}] steps".format(alive_steps)) - print("Total reward [{}]".format(total_reward)) + if cfg.VERBOSE: + print("Survived [{}] steps".format(alive_steps)) + print("Total reward [{}]".format(total_reward)) alive_steps = 0 total_reward = 0 else: @@ -307,7 +312,7 @@ def _batch_train(self, training_step, step): """Trains network to fit given parameters""" # Sample from experience buffer - sample_batch = self.per_buffer.sample(self.batch_size, PER_BETA) + sample_batch = self.per_buffer.sample(self.batch_size, cfg.PER_BETA) s_batch = sample_batch[0] a_batch = sample_batch[1] r_batch = sample_batch[2] @@ -344,7 +349,7 @@ def _batch_train(self, training_step, step): doubleQ = Q2[i, np.argmax(Q1[i])] Q[i, a_batch[i]] = r_batch[i] if d_batch[i] == False: - Q[i, a_batch[i]] += DISCOUNT_FACTOR * doubleQ + Q[i, a_batch[i]] += cfg.DISCOUNT_FACTOR * doubleQ # Batch train loss = self.Qmain.train_on_batch(input_t, Q, w_batch) @@ -356,7 +361,7 @@ def _batch_train(self, training_step, step): self.per_buffer.update_priorities(idx_batch, priorities) # Log some useful metrics every even updates - if step % (UPDATE_FREQ * 2) == 0: + if step % (cfg.UPDATE_FREQ * 2) == 0: with self.tf_writer.as_default(): mean_reward = np.mean(self.epoch_rewards) mean_alive = np.mean(self.epoch_alive) @@ -372,5 +377,5 @@ def _batch_train(self, training_step, step): tf.summary.scalar("mean_alive_100", mean_alive_100, step) tf.summary.scalar("loss", loss, step) tf.summary.scalar("lr", self.Qmain.train_lr, step) - - print("loss =", loss) + if cfg.VERBOSE: + print("loss =", loss) diff --git a/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQNConfig.py b/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQNConfig.py new file mode 100644 index 0000000..a343692 --- /dev/null +++ b/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQNConfig.py @@ -0,0 +1,45 @@ +import os +import json + +class DoubleDuelingDQNConfig(): + """ + DoubleDuelingDQN configurable hyperparameters + exposed as class attributes + """ + + LR_DECAY_STEPS = 1024*64 + LR_DECAY_RATE = 0.95 + INITIAL_EPSILON = 0.99 + FINAL_EPSILON = 0.001 + DECAY_EPSILON = 1024*64 + 
DISCOUNT_FACTOR = 0.98 + PER_CAPACITY = 1024*64 + PER_ALPHA = 0.7 + PER_BETA = 0.5 + UPDATE_FREQ = 28 + UPDATE_TARGET_HARD_FREQ = -1 + UPDATE_TARGET_SOFT_TAU = 1e-3 + N_FRAMES = 4 + BATCH_SIZE = 32 + LR = 1e-5 + VERBOSE = True + + @staticmethod + def from_json(json_in_path): + with open(json_in_path, 'r') as fp: + conf_json = json.load(fp) + + for k,v in conf_json.items(): + if hasattr(DoubleDuelingDQNConfig, k): + setattr(DoubleDuelingDQNConfig, k, v) + + @staticmethod + def to_json(json_out_path): + conf_json = {} + for attr in dir(DoubleDuelingDQNConfig): + if attr.startswith('__') or callable(getattr(DoubleDuelingDQNConfig, attr)): + continue + conf_json[attr] = getattr(DoubleDuelingDQNConfig, attr) + + with open(json_out_path, 'w+') as fp: + json.dump(conf_json, fp, indent=2) diff --git a/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN_NN.py b/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN_NN.py index 2d98343..685c5fe 100644 --- a/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN_NN.py +++ b/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN_NN.py @@ -36,32 +36,43 @@ def __init__(self, self.construct_q_network() def construct_q_network(self): - input_layer = tfk.Input(shape = (self.observation_size * self.num_frames,), name="input_obs") + input_shape = (self.observation_size * self.num_frames,) + input_layer = tfk.Input(shape = input_shape, name="input_obs") lay1 = tfkl.Dense(self.observation_size * 2, name="fc_1")(input_layer) lay1 = tfka.relu(lay1, alpha=0.01) #leaky_relu lay2 = tfkl.Dense(self.observation_size, name="fc_2")(lay1) lay2 = tfka.relu(lay2, alpha=0.01) #leaky_relu - lay3 = tfkl.Dense(self.action_size * 3, name="fc_3")(lay2) + lay3 = tfkl.Dense(896, name="fc_3")(lay2) lay3 = tfka.relu(lay3, alpha=0.01) #leaky_relu - advantage = tfkl.Dense(self.action_size * 2, name="fc_adv")(lay3) - advantage = tfkl.Dense(self.action_size, name="adv")(advantage) + lay4 = tfkl.Dense(512, name="fc_4")(lay3) + lay4 = tfka.relu(lay4, alpha=0.01) #leaky_relu - value = tfkl.Dense(self.action_size * 2, name="fc_val")(lay3) + advantage = tfkl.Dense(384, name="fc_adv")(lay4) + advantage = tfka.relu(advantage, alpha=0.01) #leaky_relu + advantage = tfkl.Dense(self.action_size, name="adv")(advantage) + advantage_mean = tf.math.reduce_mean(advantage, + axis=1, keepdims=True, + name="adv_mean") + advantage = tfkl.subtract([advantage, advantage_mean], + name="adv_subtract") + + value = tfkl.Dense(384, name="fc_val")(lay4) + value = tfka.relu(value, alpha=0.01) #leaky_relu value = tfkl.Dense(1, name="val")(value) - advantage_mean = tf.math.reduce_mean(advantage, axis=1, keepdims=True, name="adv_mean") - advantage = tfkl.subtract([advantage, advantage_mean], name="adv_subtract") Q = tf.math.add(value, advantage, name="Qout") self.model = tfk.Model(inputs=[input_layer], outputs=[Q], name=self.__class__.__name__) # Backwards pass - self.schedule = tfko.schedules.InverseTimeDecay(self.lr, self.lr_decay_steps, self.lr_decay_rate) - self.optimizer = tfko.Adam(learning_rate=self.schedule) + self.schedule = tfko.schedules.InverseTimeDecay(self.lr, + self.lr_decay_steps, + self.lr_decay_rate) + self.optimizer = tfko.Adam(learning_rate=self.schedule, clipnorm=1.0) def train_on_batch(self, x, y_true, sample_weight): with tf.GradientTape() as tape: @@ -69,10 +80,11 @@ def train_on_batch(self, x, y_true, sample_weight): y_pred = self.model(x) # Compute loss for each sample in the batch - batch_loss = self._clipped_batch_loss(y_true, y_pred) + batch_loss = self._batch_loss(y_true, y_pred) # Apply samples weights - tf_sample_weight = 
tf.convert_to_tensor(sample_weight, dtype=tf.float32) + tf_sample_weight = tf.convert_to_tensor(sample_weight, + dtype=tf.float32) batch_loss = tf.math.multiply(batch_loss, tf_sample_weight) # Compute mean scalar loss @@ -80,23 +92,27 @@ def train_on_batch(self, x, y_true, sample_weight): # Compute gradients grads = tape.gradient(loss, self.model.trainable_variables) + # Apply gradients - self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables)) + grad_pairs = zip(grads, self.model.trainable_variables) + self.optimizer.apply_gradients(grad_pairs) # Store LR self.train_lr = self.optimizer._decayed_lr('float32').numpy() # Return loss scalar return loss.numpy() - def _clipped_batch_loss(self, y_true, y_pred): + def _batch_loss(self, y_true, y_pred): sq_error = tf.math.square(y_true - y_pred, name="sq_error") - # We store it because that's the priorities vector for importance update - batch_sq_error = tf.math.reduce_sum(sq_error, axis=1, name="batch_sq_error") + # We store it because that's the priorities vector + # for importance update + batch_sq_error = tf.math.reduce_sum(sq_error, axis=1, + name="batch_sq_error") # Stored as numpy array since we are in eager mode self.batch_sq_error = batch_sq_error.numpy() - return tf.clip_by_value(batch_sq_error, 0.0, 1e3, name="batch_sq_error_clip") + return batch_sq_error def random_move(self): opt_policy = np.random.randint(0, self.action_size) @@ -135,5 +151,5 @@ def save_network(self, path): def load_network(self, path): # Load from a model.h5 file self.model.load_weights(path) - print("Succesfully loaded network from: {}".format(path)) + print("Successfully loaded network from: {}".format(path)) diff --git a/l2rpn_baselines/DoubleDuelingDQN/__init__.py b/l2rpn_baselines/DoubleDuelingDQN/__init__.py index f5ced0d..d309b42 100644 --- a/l2rpn_baselines/DoubleDuelingDQN/__init__.py +++ b/l2rpn_baselines/DoubleDuelingDQN/__init__.py @@ -1,9 +1,11 @@ __all__ = [ "DoubleDuelingDQN", + "DoubleDuelingDQNConfig", "evaluate", "train" ] from l2rpn_baselines.DoubleDuelingDQN.DoubleDuelingDQN import DoubleDuelingDQN +from l2rpn_baselines.DoubleDuelingDQN.DoubleDuelingDQNConfig import DoubleDuelingDQNConfig from l2rpn_baselines.DoubleDuelingDQN.evaluate import evaluate from l2rpn_baselines.DoubleDuelingDQN.train import train diff --git a/l2rpn_baselines/DoubleDuelingDQN/evaluate.py b/l2rpn_baselines/DoubleDuelingDQN/evaluate.py index fc9409e..2bffa07 100755 --- a/l2rpn_baselines/DoubleDuelingDQN/evaluate.py +++ b/l2rpn_baselines/DoubleDuelingDQN/evaluate.py @@ -17,7 +17,8 @@ from grid2op.Reward import * from grid2op.Action import * -from l2rpn_baselines.DoubleDuelingDQN.DoubleDuelingDQN import DoubleDuelingDQN as DDDQNAgent +from l2rpn_baselines.DoubleDuelingDQN.DoubleDuelingDQN import DoubleDuelingDQN as D3QNAgent +from l2rpn_baselines.DoubleDuelingDQN.DoubleDuelingDQNConfig import DoubleDuelingDQNConfig as D3QNConfig from l2rpn_baselines.utils.save_log_gif import save_log_gif DEFAULT_LOGS_DIR = "./logs-evals" @@ -25,7 +26,7 @@ DEFAULT_NB_PROCESS = 1 DEFAULT_MAX_STEPS = -1 DEFAULT_NUM_FRAMES = 4 - +DEFAULT_VERBOSE = True def cli(): parser = argparse.ArgumentParser(description="Eval baseline DDDQN") @@ -62,21 +63,24 @@ def evaluate(env, nb_process=DEFAULT_NB_PROCESS, max_steps=DEFAULT_MAX_STEPS, num_frames=DEFAULT_NUM_FRAMES, - verbose=False, + verbose=DEFAULT_VERBOSE, save_gif=False): + # Set config + D3QNConfig.N_FRAMES = num_frames + D3QNConfig.VERBOSE = verbose + # Limit gpu usage physical_devices = tf.config.list_physical_devices('GPU') 
tf.config.experimental.set_memory_growth(physical_devices[0], True) runner_params = env.get_params_for_runner() - runner_params["verbose"] = args.verbose + runner_params["verbose"] = verbose # Create agent - agent = DDDQNAgent(env.observation_space, - env.action_space, - is_training=False, - num_frames=num_frames) + agent = D3QNAgent(env.observation_space, + env.action_space, + is_training=False) # Load weights from file agent.load(load_path) @@ -87,10 +91,11 @@ def evaluate(env, agentInstance=agent) # Print model summary - stringlist = [] - agent.Qmain.model.summary(print_fn=lambda x: stringlist.append(x)) - short_model_summary = "\n".join(stringlist) - print(short_model_summary) + if verbose: + stringlist = [] + agent.Qmain.model.summary(print_fn=lambda x: stringlist.append(x)) + short_model_summary = "\n".join(stringlist) + print(short_model_summary) # Run os.makedirs(logs_path, exist_ok=True) @@ -98,19 +103,23 @@ def evaluate(env, nb_episode=nb_episode, nb_process=nb_process, max_iter=max_steps, - pbar=True) + pbar=verbose) # Print summary - print("Evaluation summary:") - for _, chron_name, cum_reward, nb_time_step, max_ts in res: - msg_tmp = "chronics at: {}".format(chron_name) - msg_tmp += "\ttotal reward: {:.6f}".format(cum_reward) - msg_tmp += "\ttime steps: {:.0f}/{:.0f}".format(nb_time_step, max_ts) - print(msg_tmp) + if verbose: + print("Evaluation summary:") + for _, chron_name, cum_reward, nb_time_step, max_ts in res: + msg_tmp = "chronics at: {}".format(chron_name) + msg_tmp += "\ttotal reward: {:.6f}".format(cum_reward) + msg_tmp += "\ttime steps: {:.0f}/{:.0f}".format(nb_time_step, + max_ts) + print(msg_tmp) if save_gif: save_log_gif(logs_path, res) + return res + if __name__ == "__main__": # Parse command line args = cli() diff --git a/l2rpn_baselines/DoubleDuelingDQN/train.py b/l2rpn_baselines/DoubleDuelingDQN/train.py index 9d2ff74..21f0f58 100755 --- a/l2rpn_baselines/DoubleDuelingDQN/train.py +++ b/l2rpn_baselines/DoubleDuelingDQN/train.py @@ -11,7 +11,8 @@ import argparse import tensorflow as tf -from l2rpn_baselines.DoubleDuelingDQN.DoubleDuelingDQN import DoubleDuelingDQN as DDDQNAgent +from l2rpn_baselines.DoubleDuelingDQN.DoubleDuelingDQN import DoubleDuelingDQN as D3QNAgent +from l2rpn_baselines.DoubleDuelingDQN.DoubleDuelingDQNConfig import DoubleDuelingDQNConfig as D3QNConfig DEFAULT_NAME = "DoubleDuelingDQN" DEFAULT_SAVE_DIR = "./models" @@ -21,7 +22,7 @@ DEFAULT_N_FRAMES = 4 DEFAULT_BATCH_SIZE = 32 DEFAULT_LR = 1e-5 - +DEFAULT_VERBOSE = True def cli(): parser = argparse.ArgumentParser(description="Train baseline DDQN") @@ -66,20 +67,24 @@ def train(env, num_pre_training_steps = DEFAULT_PRE_STEPS, num_frames = DEFAULT_N_FRAMES, batch_size= DEFAULT_BATCH_SIZE, - learning_rate= DEFAULT_LR): + learning_rate= DEFAULT_LR, + verbose=DEFAULT_VERBOSE): + + # Set config + D3QNConfig.LR = learning_rate + D3QNConfig.N_FRAMES = num_frames + D3QNConfig.BATCH_SIZE = batch_size + D3QNConfig.VERBOSE = verbose # Limit gpu usage physical_devices = tf.config.list_physical_devices('GPU') if len(physical_devices) > 0: tf.config.experimental.set_memory_growth(physical_devices[0], True) - agent = DDDQNAgent(env.observation_space, - env.action_space, - name=name, - is_training=True, - batch_size=batch_size, - num_frames=num_frames, - lr=learning_rate) + agent = D3QNAgent(env.observation_space, + env.action_space, + name=name, + is_training=True) if load_path is not None: agent.load(load_path) @@ -99,13 +104,9 @@ def train(env, import sys args = cli() - # Use custom params - params = 
Parameters() - params.MAX_SUB_CHANGED = 2 # Create grid2op game environement env = make(args.data_dir, - param=params, action_class=TopologyChangeAndDispatchAction, reward_class=CombinedScaledReward) @@ -114,13 +115,14 @@ def train(env, # Register custom reward for training cr = env.reward_helper.template_reward - cr.addReward("overflow", CloseToOverflowReward(), 50.0) - cr.addReward("game", GameplayReward(), 200.0) - cr.addReward("recolines", LinesReconnectedReward(), 50.0) + #cr.addReward("overflow", CloseToOverflowReward(), 1.0) + cr.addReward("game", GameplayReward(), 1.0) + #cr.addReward("recolines", LinesReconnectedReward(), 1.0) + cr.addReward("l2rpn", L2RPNReward(), 2.0/float(env.n_line)) # Initialize custom rewards cr.initialize(env) # Set reward range to something managable - cr.set_range(-10.0, 10.0) + cr.set_range(-1.0, 1.0) train(env, name = args.name, diff --git a/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQN.py b/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQN.py index 1affec6..71000b6 100644 --- a/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQN.py +++ b/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQN.py @@ -16,28 +16,16 @@ from grid2op.Agent import AgentWithConverter from grid2op.Converter import IdToAct +from l2rpn_baselines.DoubleDuelingRDQN.DoubleDuelingRDQNConfig import DoubleDuelingRDQNConfig as cfg from l2rpn_baselines.DoubleDuelingRDQN.ExperienceBuffer import ExperienceBuffer from l2rpn_baselines.DoubleDuelingRDQN.DoubleDuelingRDQN_NN import DoubleDuelingRDQN_NN -INITIAL_EPSILON = 0.99 -FINAL_EPSILON = 0.01 -DECAY_EPSILON = 1024*32 -STEP_EPSILON = (INITIAL_EPSILON-FINAL_EPSILON)/DECAY_EPSILON -DISCOUNT_FACTOR = 0.99 -REPLAY_BUFFER_SIZE = 1024*4 -UPDATE_FREQ = 64 -UPDATE_TARGET_HARD_FREQ = -1 -UPDATE_TARGET_SOFT_TAU = 0.001 - class DoubleDuelingRDQN(AgentWithConverter): def __init__(self, observation_space, action_space, name=__name__, - trace_length=1, - batch_size=1, - is_training=False, - lr=1e-5): + is_training=False): # Call parent constructor AgentWithConverter.__init__(self, action_space, action_space_converter=IdToAct) @@ -45,10 +33,10 @@ def __init__(self, # Store constructor params self.observation_space = observation_space self.name = name - self.trace_length = trace_length - self.batch_size = batch_size + self.trace_length = cfg.TRACE_LENGTH + self.batch_size = cfg.BATCH_SIZE self.is_training = is_training - self.lr = lr + self.lr = cfg.LR # Declare required vars self.Qmain = None @@ -78,7 +66,9 @@ def __init__(self, def _init_training(self): - self.exp_buffer = ExperienceBuffer(REPLAY_BUFFER_SIZE, self.batch_size, self.trace_length) + self.exp_buffer = ExperienceBuffer(cfg.REPLAY_BUFFER_SIZE, + self.batch_size, + self.trace_length) self.done = True self.epoch_rewards = [] self.epoch_alive = [] @@ -110,17 +100,17 @@ def _register_experience(self, episode_exp, episode): def _save_hyperparameters(self, logpath, env, steps): r_instance = env.reward_helper.template_reward hp = { - "lr": self.lr, - "batch_size": self.batch_size, - "trace_len": self.trace_length, - "e_start": INITIAL_EPSILON, - "e_end": FINAL_EPSILON, - "e_decay": DECAY_EPSILON, - "discount": DISCOUNT_FACTOR, - "buffer_size": REPLAY_BUFFER_SIZE, - "update_freq": UPDATE_FREQ, - "update_hard": UPDATE_TARGET_HARD_FREQ, - "update_soft": UPDATE_TARGET_SOFT_TAU, + "lr": cfg.LR, + "batch_size": cfg.BATCH_SIZE, + "trace_len": cfg.TRACE_LENGTH, + "e_start": cfg.INITIAL_EPSILON, + "e_end": cfg.FINAL_EPSILON, + "e_decay": cfg.DECAY_EPSILON, + "discount": cfg.DISCOUNT_FACTOR, + 
"buffer_size": cfg.REPLAY_BUFFER_SIZE, + "update_freq": cfg.UPDATE_FREQ, + "update_hard": cfg.UPDATE_TARGET_HARD_FREQ, + "update_soft": cfg.UPDATE_TARGET_SOFT_TAU, "reward": dict(r_instance) } hp_filename = "{}-hypers.json".format(self.name) @@ -153,7 +143,9 @@ def reset(self, observation): def my_act(self, state, reward, done=False): data_input = np.array(state) data_input.reshape(1, 1, self.observation_size) - a, _, m, c = self.Qmain.predict_move(data_input, self.mem_state, self.carry_state) + a, _, m, c = self.Qmain.predict_move(data_input, + self.mem_state, + self.carry_state) self.mem_state = m self.carry_state = c @@ -178,7 +170,7 @@ def train(self, env, num_training_steps = iterations num_steps = num_pre_training_steps + num_training_steps step = 0 - epsilon = INITIAL_EPSILON + epsilon = cfg.INITIAL_EPSILON alive_steps = 0 total_reward = 0 episode = 0 @@ -204,17 +196,24 @@ def train(self, env, episode += 1 episode_exp = [] - if step % 1000 == 0: + if cfg.VERBOSE and step % 1000 == 0: print("Step [{}] -- Dropout [{}]".format(step, epsilon)) # Choose an action if step <= num_pre_training_steps: - a, m, c = self.Qmain.random_move(self.state, self.mem_state, self.carry_state) + a, m, c = self.Qmain.random_move(self.state, + self.mem_state, + self.carry_state) elif len(episode_exp) < self.trace_length: - a, m, c = self.Qmain.random_move(self.state, self.mem_state, self.carry_state) + a, m, c = self.Qmain.random_move(self.state, + self.mem_state, + self.carry_state) a = 0 # Do Nothing else: - a, _, m, c = self.Qmain.bayesian_move(self.state, self.mem_state, self.carry_state, epsilon) + a, _, m, c = self.Qmain.bayesian_move(self.state, + self.mem_state, + self.carry_state, + epsilon) # Update LSTM state self.mem_state = m @@ -233,31 +232,36 @@ def train(self, env, if step >= num_pre_training_steps: training_step = step - num_pre_training_steps # Slowly decay dropout rate - if epsilon > FINAL_EPSILON: - epsilon -= STEP_EPSILON - if epsilon < FINAL_EPSILON: - epsilon = FINAL_EPSILON + if epsilon > cfg.FINAL_EPSILON: + epsilon -= cfg.STEP_EPSILON + if epsilon < cfg.FINAL_EPSILON: + epsilon = cfg.FINAL_EPSILON # Perform training at given frequency - if step % UPDATE_FREQ == 0 and self.exp_buffer.can_sample(): + if step % cfg.UPDATE_FREQ == 0 and \ + self.exp_buffer.can_sample(): # Sample from experience buffer batch = self.exp_buffer.sample() # Perform training self._batch_train(batch, step, training_step) # Update target network towards primary network - if UPDATE_TARGET_SOFT_TAU > 0: - self.Qmain.update_target_soft(self.Qtarget.model, tau=UPDATE_TARGET_SOFT_TAU) - - # Every UPDATE_TARGET_HARD_FREQ trainings, update target completely - if UPDATE_TARGET_HARD_FREQ > 0 and step % (UPDATE_FREQ * UPDATE_TARGET_HARD_FREQ) == 0: + if cfg.UPDATE_TARGET_SOFT_TAU > 0: + tau = cfg.UPDATE_TARGET_SOFT_TAU + self.Qmain.update_target_soft(self.Qtarget.model, tau) + + # Every UPDATE_TARGET_HARD_FREQ trainings, + # update target completely + if cfg.UPDATE_TARGET_HARD_FREQ > 0 and \ + step % (cfg.UPDATE_FREQ * cfg.UPDATE_TARGET_HARD_FREQ) == 0: self.Qmain.update_target_hard(self.Qtarget.model) total_reward += reward if self.done: self.epoch_rewards.append(total_reward) self.epoch_alive.append(alive_steps) - print("Survived [{}] steps".format(alive_steps)) - print("Total reward [{}]".format(total_reward)) + if cfg.VERBOSE: + print("Survived [{}] steps".format(alive_steps)) + print("Total reward [{}]".format(total_reward)) alive_steps = 0 total_reward = 0 else: @@ -286,9 +290,21 @@ def _batch_train(self, batch, 
step, training_step): m_data = m_data.reshape(self.batch_size, self.trace_length, input_size) t_data = np.vstack(batch[:, 4]) t_data = t_data.reshape(self.batch_size, self.trace_length, input_size) - q_input = [copy.deepcopy(batch_mem), copy.deepcopy(batch_carry), copy.deepcopy(m_data)] - q1_input = [copy.deepcopy(batch_mem), copy.deepcopy(batch_carry), copy.deepcopy(t_data)] - q2_input = [copy.deepcopy(batch_mem), copy.deepcopy(batch_carry), copy.deepcopy(t_data)] + q_input = [ + copy.deepcopy(batch_mem), + copy.deepcopy(batch_carry), + copy.deepcopy(m_data) + ] + q1_input = [ + copy.deepcopy(batch_mem), + copy.deepcopy(batch_carry), + copy.deepcopy(t_data) + ] + q2_input = [ + copy.deepcopy(batch_mem), + copy.deepcopy(batch_carry), + copy.deepcopy(t_data) + ] # Batch predict self.Qmain.trace_length.assign(self.trace_length) @@ -301,7 +317,8 @@ def _batch_train(self, batch, step, training_step): tf.summary.trace_on() # T Batch predict - Q, _, _ = self.Qmain.model.predict(q_input, batch_size = self.batch_size) + Q, _, _ = self.Qmain.model.predict(q_input, + batch_size = self.batch_size) ## Log graph once and disable graph logging if training_step == 0: @@ -309,8 +326,10 @@ def _batch_train(self, batch, step, training_step): tf.summary.trace_export(self.name + "-graph", step) # T+1 batch predict - Q1, _, _ = self.Qmain.model.predict(q1_input, batch_size=self.batch_size) - Q2, _, _ = self.Qtarget.model.predict(q2_input, batch_size=self.batch_size) + Q1, _, _ = self.Qmain.model.predict(q1_input, + batch_size=self.batch_size) + Q2, _, _ = self.Qtarget.model.predict(q2_input, + batch_size=self.batch_size) # Compute batch Double Q update to Qtarget for i in range(self.batch_size): @@ -321,7 +340,7 @@ def _batch_train(self, batch, step, training_step): d = batch[idx][3] Q[i, a] = r if d == False: - Q[i, a] += DISCOUNT_FACTOR * doubleQ + Q[i, a] += cfg.DISCOUNT_FACTOR * doubleQ # Batch train batch_x = [batch_mem, batch_carry, m_data] @@ -330,8 +349,10 @@ def _batch_train(self, batch, step, training_step): loss = loss[0] # Log some useful metrics - if step % (UPDATE_FREQ * 2) == 0: - print("loss =", loss) + if step % (cfg.UPDATE_FREQ * 2) == 0: + if cfg.VERBOSE: + print("loss =", loss) + with self.tf_writer.as_default(): mean_reward = np.mean(self.epoch_rewards) mean_alive = np.mean(self.epoch_alive) diff --git a/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQNConfig.py b/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQNConfig.py new file mode 100644 index 0000000..321fc29 --- /dev/null +++ b/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQNConfig.py @@ -0,0 +1,41 @@ +import os +import json + +class DoubleDuelingRDQNConfig(): + """ + DoubleDuelingRDQN configurable hyperparameters as class attributes + """ + + INITIAL_EPSILON = 0.99 + FINAL_EPSILON = 0.01 + DECAY_EPSILON = 1024*32 + STEP_EPSILON = (INITIAL_EPSILON-FINAL_EPSILON)/DECAY_EPSILON + DISCOUNT_FACTOR = 0.99 + REPLAY_BUFFER_SIZE = 1024*4 + UPDATE_FREQ = 64 + UPDATE_TARGET_HARD_FREQ = -1 + UPDATE_TARGET_SOFT_TAU = 0.001 + TRACE_LENGTH = 8 + BATCH_SIZE = 32 + LR = 1e-5 + VERBOSE = True + + @staticmethod + def from_json(json_in_path): + with open(json_in_path, 'r') as fp: + conf_json = json.load(fp) + + for k,v in conf_json.items(): + if hasattr(DoubleDuelingRDQNConfig, k): + setattr(DoubleDuelingRDQNConfig, k, v) + + @staticmethod + def to_json(json_out_path): + conf_json = {} + for attr in dir(DoubleDuelingRDQNConfig): + if attr.startswith('__') or callable(getattr(DoubleDuelingRDQNConfig, attr)): + continue + conf_json[attr] = getattr(DoubleDuelingRDQNConfig, attr) + + 
with open(json_out_path, 'w+') as fp: + json.dump(conf_json, fp, indent=2) diff --git a/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQN_NN.py b/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQN_NN.py index 860b470..884a7ec 100644 --- a/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQN_NN.py +++ b/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQN_NN.py @@ -69,33 +69,36 @@ def construct_q_network(self): advantage = tfkl.Dense(64, name="fc_adv")(lstm_output) advantage = tf.nn.leaky_relu(advantage, alpha=0.01, name="leak_adv") advantage = tfkl.Dense(self.action_size, name="adv")(advantage) + advantage_mean = tf.math.reduce_mean(advantage, axis=1, + keepdims=True, name="adv_mean") + advantage = tfkl.subtract([advantage, advantage_mean], name="adv_sub") value = tfkl.Dense(64, name="fc_val")(lstm_output) value = tf.nn.leaky_relu(value, alpha=0.01, name="leak_val") value = tfkl.Dense(1, name="val")(value) - advantage_mean = tf.math.reduce_mean(advantage, axis=1, keepdims=True, name="adv_mean") - advantage = tfkl.subtract([advantage, advantage_mean], name="adv_sub") Q = tf.math.add(value, advantage, name="Qout") # Backwards pass - self.model = tfk.Model(inputs=[input_mem_state, input_carry_state, input_layer], - outputs=[Q, mem_s, carry_s], + model_inputs = [input_mem_state, input_carry_state, input_layer] + model_outputs = [Q, mem_s, carry_s] + self.model = tfk.Model(inputs=model_inputs, + outputs=model_outputs, name=self.__class__.__name__) losses = [ - self._clipped_mse_loss, + self._mse_loss, self._no_loss, self._no_loss ] - self.model.compile(loss=losses, optimizer=tfko.Adam(lr=self.lr, clipnorm=1.0)) + self.optimizer = tfko.Adam(lr=self.lr, clipnorm=1.0) + self.model.compile(loss=losses, optimizer=self.optimizer) def _no_loss(self, y_true, y_pred): return 0.0 - def _clipped_mse_loss(self, Qnext, Q): + def _mse_loss(self, Qnext, Q): loss = tf.math.reduce_mean(tf.math.square(Qnext - Q), name="loss_mse") - clipped_loss = tf.clip_by_value(loss, 0.0, 1e3, name="loss_clip") - return clipped_loss + return loss def bayesian_move(self, data, mem, carry, rate = 0.0): self.dropout_rate.assign(float(rate)) @@ -165,4 +168,4 @@ def save_network(self, path): def load_network(self, path): # nothing has changed self.model.load_weights(path) - print("Succesfully loaded network from: {}".format(path)) + print("Successfully loaded network from: {}".format(path)) diff --git a/l2rpn_baselines/DoubleDuelingRDQN/__init__.py b/l2rpn_baselines/DoubleDuelingRDQN/__init__.py index bf1698c..085c33f 100644 --- a/l2rpn_baselines/DoubleDuelingRDQN/__init__.py +++ b/l2rpn_baselines/DoubleDuelingRDQN/__init__.py @@ -1,10 +1,12 @@ __all__ = [ "DoubleDuelingRDQN", + "DoubleDuelingRDQNConfig", "evaluate", "train" ] from l2rpn_baselines.DoubleDuelingRDQN.DoubleDuelingRDQN import DoubleDuelingRDQN +from l2rpn_baselines.DoubleDuelingRDQN.DoubleDuelingRDQNConfig import DoubleDuelingRDQNConfig from l2rpn_baselines.DoubleDuelingRDQN.evaluate import evaluate from l2rpn_baselines.DoubleDuelingRDQN.train import train diff --git a/l2rpn_baselines/DoubleDuelingRDQN/evaluate.py b/l2rpn_baselines/DoubleDuelingRDQN/evaluate.py index 105006f..490eb6c 100755 --- a/l2rpn_baselines/DoubleDuelingRDQN/evaluate.py +++ b/l2rpn_baselines/DoubleDuelingRDQN/evaluate.py @@ -17,6 +17,7 @@ from grid2op.Reward import * from grid2op.Action import * +from l2rpn_baselines.DoubleDuelingRDQN.DoubleDuelingRDQNConfig import DoubleDuelingRDQNConfig as RDQNConfig from l2rpn_baselines.DoubleDuelingRDQN.DoubleDuelingRDQN import DoubleDuelingRDQN as RDQNAgent 
from l2rpn_baselines.utils.save_log_gif import save_log_gif @@ -24,7 +25,7 @@ DEFAULT_NB_EPISODE = 1 DEFAULT_NB_PROCESS = 1 DEFAULT_MAX_STEPS = -1 - +DEFAULT_VERBOSE = True def cli(): parser = argparse.ArgumentParser(description="Eval baseline DDDQN") @@ -57,7 +58,7 @@ def evaluate(env, nb_episode=DEFAULT_NB_EPISODE, nb_process=DEFAULT_NB_PROCESS, max_steps=DEFAULT_MAX_STEPS, - verbose=False, + verbose=DEFAULT_VERBOSE, save_gif=False): # Limit gpu usage @@ -65,7 +66,7 @@ def evaluate(env, tf.config.experimental.set_memory_growth(physical_devices[0], True) runner_params = env.get_params_for_runner() - runner_params["verbose"] = args.verbose + runner_params["verbose"] = verbose # Run # Create agent @@ -82,10 +83,11 @@ def evaluate(env, agentInstance=agent) # Print model summary - stringlist = [] - agent.Qmain.model.summary(print_fn=lambda x: stringlist.append(x)) - short_model_summary = "\n".join(stringlist) - print(short_model_summary) + if verbose: + stringlist = [] + agent.Qmain.model.summary(print_fn=lambda x: stringlist.append(x)) + short_model_summary = "\n".join(stringlist) + print(short_model_summary) # Run os.makedirs(logs_path, exist_ok=True) @@ -93,19 +95,22 @@ def evaluate(env, nb_episode=nb_episode, nb_process=nb_process, max_iter=max_steps, - pbar=True) + pbar=verbose) # Print summary - print("Evaluation summary:") - for _, chron_name, cum_reward, nb_time_step, max_ts in res: - msg_tmp = "chronics at: {}".format(chron_name) - msg_tmp += "\ttotal reward: {:.6f}".format(cum_reward) - msg_tmp += "\ttime steps: {:.0f}/{:.0f}".format(nb_time_step, max_ts) - print(msg_tmp) + if verbose: + print("Evaluation summary:") + for _, chron_name, cum_reward, nb_time_step, max_ts in res: + msg_tmp = "chronics at: {}".format(chron_name) + msg_tmp += "\ttotal reward: {:.6f}".format(cum_reward) + msg_tmp += "\ttime steps: {:.0f}/{:.0f}".format(nb_time_step, + max_ts) + print(msg_tmp) if save_gif: save_log_gif(logs_path, res) + return res if __name__ == "__main__": # Parse command line diff --git a/l2rpn_baselines/DoubleDuelingRDQN/train.py b/l2rpn_baselines/DoubleDuelingRDQN/train.py index 1696129..d75f216 100755 --- a/l2rpn_baselines/DoubleDuelingRDQN/train.py +++ b/l2rpn_baselines/DoubleDuelingRDQN/train.py @@ -15,6 +15,7 @@ from grid2op.Reward import * from grid2op.Action import * +from l2rpn_baselines.DoubleDuelingRDQN.DoubleDuelingRDQNConfig import DoubleDuelingRDQNConfig as RDQNConfig from l2rpn_baselines.DoubleDuelingRDQN.DoubleDuelingRDQN import DoubleDuelingRDQN as RDQNAgent DEFAULT_NAME = "DoubleDuelingRDQN" @@ -25,7 +26,7 @@ DEFAULT_TRACE_LEN = 12 DEFAULT_BATCH_SIZE = 32 DEFAULT_LR = 1e-5 - +DEFAULT_VERBOSE = True def cli(): parser = argparse.ArgumentParser(description="Train baseline DDQN") @@ -72,7 +73,14 @@ def train(env, num_pre_training_steps=DEFAULT_PRE_STEPS, trace_length=DEFAULT_TRACE_LEN, batch_size=DEFAULT_BATCH_SIZE, - learning_rate=DEFAULT_LR): + learning_rate=DEFAULT_LR, + verbose=DEFAULT_VERBOSE): + + # Set config + RDQNConfig.TRACE_LENGTH = trace_length + RDQNConfig.BATCH_SIZE = batch_size + RDQNConfig.LR = learning_rate + RDQNConfig.VERBOSE = verbose # Limit gpu usage physical_devices = tf.config.list_physical_devices('GPU') @@ -82,10 +90,7 @@ def train(env, agent = RDQNAgent(env.observation_space, env.action_space, name=name, - is_training=True, - batch_size=batch_size, - trace_length=trace_length, - lr=learning_rate) + is_training=True) if load_path is not None: agent.load(load_path) diff --git a/l2rpn_baselines/SliceRDQN/SliceRDQN.py 
b/l2rpn_baselines/SliceRDQN/SliceRDQN.py index 3d3fdc4..df093f2 100644 --- a/l2rpn_baselines/SliceRDQN/SliceRDQN.py +++ b/l2rpn_baselines/SliceRDQN/SliceRDQN.py @@ -17,30 +17,17 @@ from grid2op.Converter import IdToAct from l2rpn_baselines.SliceRDQN.ExperienceBuffer import ExperienceBuffer +from l2rpn_baselines.SliceRDQN.SliceRDQN_Config import SliceRDQN_Config as cfg from l2rpn_baselines.SliceRDQN.SliceRDQN_NN import SliceRDQN_NN from l2rpn_baselines.SliceRDQN.slice_util import * -INITIAL_EPSILON = 0.80 -FINAL_EPSILON = 0.01 -DECAY_EPSILON = 1024*256 -STEP_EPSILON = (INITIAL_EPSILON-FINAL_EPSILON)/DECAY_EPSILON -DISCOUNT_FACTOR = 0.99 -REPLAY_BUFFER_SIZE = 1024*8 -UPDATE_FREQ = 512 -UPDATE_TARGET_HARD_FREQ = -1 -UPDATE_TARGET_SOFT_TAU = 0.001 -INPUT_BIAS = 3.0 -SUFFLE_FREQ = 1000 class SliceRDQN(AgentWithConverter): def __init__(self, observation_space, action_space, name=__name__, - trace_length=1, - batch_size=1, - is_training=False, - lr=1e-5): + is_training=False): # Call parent constructor AgentWithConverter.__init__(self, action_space, action_space_converter=IdToAct) @@ -48,10 +35,10 @@ def __init__(self, # Store constructor params self.observation_space = observation_space self.name = name - self.trace_length = trace_length - self.batch_size = batch_size + self.trace_length = cfg.TRACE_LENGTH + self.batch_size = cfg.BATCH_SIZE self.is_training = is_training - self.lr = lr + self.lr = cfg.LR # Declare required vars self.Qmain = None @@ -66,7 +53,7 @@ def __init__(self, self.epoch_rewards = None self.epoch_alive = None self.Qtarget = None - self.epsilon = INITIAL_EPSILON + self.epsilon = cfg.INITIAL_EPSILON # Compute dimensions from intial state self.action_size = self.action_space.n @@ -100,7 +87,9 @@ def __init__(self, def _init_training(self): - self.exp_buffer = ExperienceBuffer(REPLAY_BUFFER_SIZE, self.batch_size, self.trace_length) + self.exp_buffer = ExperienceBuffer(cfg.REPLAY_BUFFER_SIZE, + self.batch_size, + self.trace_length) self.done = True self.epoch_rewards = [] self.epoch_alive = [] @@ -136,15 +125,15 @@ def _save_hyperparameters(self, logpath, env, steps): "lr": self.lr, "batch_size": self.batch_size, "trace_len": self.trace_length, - "e_start": INITIAL_EPSILON, - "e_end": FINAL_EPSILON, - "e_decay": DECAY_EPSILON, - "discount": DISCOUNT_FACTOR, - "buffer_size": REPLAY_BUFFER_SIZE, - "update_freq": UPDATE_FREQ, - "update_hard": UPDATE_TARGET_HARD_FREQ, - "update_soft": UPDATE_TARGET_SOFT_TAU, - "input_bias": INPUT_BIAS, + "e_start": cfg.INITIAL_EPSILON, + "e_end": cfg.FINAL_EPSILON, + "e_decay": cfg.DECAY_EPSILON, + "discount": cfg.DISCOUNT_FACTOR, + "buffer_size": cfg.REPLAY_BUFFER_SIZE, + "update_freq": cfg.UPDATE_FREQ, + "update_hard": cfg.UPDATE_TARGET_HARD_FREQ, + "update_soft": cfg.UPDATE_TARGET_SOFT_TAU, + "input_bias": cfg.INPUT_BIAS, "reward": dict(r_instance) } hp_filename = "{}-hypers.json".format(self.name) @@ -154,7 +143,7 @@ def _save_hyperparameters(self, logpath, env, steps): ## Agent Interface def convert_obs(self, observation): - return convert_obs_pad(observation, bias=INPUT_BIAS) + return convert_obs_pad(observation, bias=cfg.INPUT_BIAS) def convert_act(self, action): return super().convert_act(action) @@ -191,7 +180,7 @@ def train(self, env, num_training_steps = iterations num_steps = num_pre_training_steps + num_training_steps step = 0 - self.epsilon = INITIAL_EPSILON + self.epsilon = cfg.INITIAL_EPSILON alive_steps = 0 total_reward = 0 episode = 0 @@ -209,10 +198,16 @@ def train(self, env, while step < num_steps: # New episode if self.done: - 
if episode % SUFFLE_FREQ == 0: + if episode % cfg.SUFFLE_FREQ == 0: # shuffle the data every now and then - env.chronics_handler.shuffle( - shuffler=lambda x: x[np.random.choice(len(x), size=len(x), replace=False)]) + def shuff(x): + s = np.random.choice(len(x), + size=len(x), + replace=False) + return x[s] + + env.chronics_handler.shuffle(shuffler=shuff) + new_obs = env.reset() # This shouldn't raise self.reset(new_obs) # Push current episode experience to experience buffer @@ -221,7 +216,7 @@ def train(self, env, episode += 1 episode_exp = [] - if step % SUFFLE_FREQ == 0: + if cfg.VERBOSE and step % cfg.SUFFLE_FREQ == 0: print("Step [{}] -- Dropout [{}]".format(step, self.epsilon)) # Choose an action @@ -252,34 +247,36 @@ def train(self, env, if step >= num_pre_training_steps: training_step = step - num_pre_training_steps # Slowly decay dropout rate - if self.epsilon > FINAL_EPSILON: - self.epsilon -= STEP_EPSILON - if self.epsilon < FINAL_EPSILON: - self.epsilon = FINAL_EPSILON + if self.epsilon > cfg.FINAL_EPSILON: + self.epsilon -= cfg.STEP_EPSILON + if self.epsilon < cfg.FINAL_EPSILON: + self.epsilon = cfg.FINAL_EPSILON # Perform training at given frequency - if step % UPDATE_FREQ == 0 and self.exp_buffer.can_sample(): + if step % cfg.UPDATE_FREQ == 0 and \ + self.exp_buffer.can_sample(): # Sample from experience buffer batch = self.exp_buffer.sample() # Perform training self._batch_train(batch, training_step, step) # Update target network towards primary network - if UPDATE_TARGET_SOFT_TAU > 0: - tau = UPDATE_TARGET_SOFT_TAU + if cfg.UPDATE_TARGET_SOFT_TAU > 0: + tau = cfg.UPDATE_TARGET_SOFT_TAU self.Qmain.update_target_soft(self.Qtarget.model, tau) # Every UPDATE_TARGET_HARD_FREQ trainings # update target completely - if UPDATE_TARGET_HARD_FREQ > 0 and \ - step % (UPDATE_FREQ * UPDATE_TARGET_HARD_FREQ) == 0: + if cfg.UPDATE_TARGET_HARD_FREQ > 0 and \ + step % (cfg.UPDATE_FREQ * cfg.UPDATE_TARGET_HARD_FREQ) == 0: self.Qmain.update_target_hard(self.Qtarget.model) total_reward += reward if self.done: self.epoch_rewards.append(total_reward) self.epoch_alive.append(alive_steps) - print("Survived [{}] steps".format(alive_steps)) - print("Total reward [{}]".format(total_reward)) + if cfg.VERBOSE: + print("Survived [{}] steps".format(alive_steps)) + print("Total reward [{}]".format(total_reward)) alive_steps = 0 total_reward = 0 else: @@ -300,17 +297,34 @@ def train(self, env, def _batch_train(self, batch, training_step, step): """Trains network to fit given parameters""" Q = np.zeros((self.batch_size, self.action_size)) - batch_mem = np.zeros((self.batch_size, self.n_slices, self.Qmain.h_size)) - batch_carry = np.zeros((self.batch_size, self.n_slices, self.Qmain.h_size)) - - input_shape = (self.batch_size, self.trace_length) + self.observation_shape + batch_mem = np.zeros((self.batch_size, + self.n_slices, + self.Qmain.h_size)) + batch_carry = np.zeros((self.batch_size, + self.n_slices, + self.Qmain.h_size)) + + input_shape = (self.batch_size, + self.trace_length) + self.observation_shape m_data = np.vstack(batch[:, 0]) m_data = m_data.reshape(input_shape) t_data = np.vstack(batch[:, 4]) t_data = t_data.reshape(input_shape) - q_input = [copy.deepcopy(batch_mem), copy.deepcopy(batch_carry), copy.deepcopy(m_data)] - q1_input = [copy.deepcopy(batch_mem), copy.deepcopy(batch_carry), copy.deepcopy(t_data)] - q2_input = [copy.deepcopy(batch_mem), copy.deepcopy(batch_carry), copy.deepcopy(t_data)] + q_input = [ + copy.deepcopy(batch_mem), + copy.deepcopy(batch_carry), + copy.deepcopy(m_data) + ] 
+ q1_input = [ + copy.deepcopy(batch_mem), + copy.deepcopy(batch_carry), + copy.deepcopy(t_data) + ] + q2_input = [ + copy.deepcopy(batch_mem), + copy.deepcopy(batch_carry), + copy.deepcopy(t_data) + ] # Batch predict self.Qmain.trace_length.assign(self.trace_length) @@ -323,7 +337,8 @@ def _batch_train(self, batch, training_step, step): tf.summary.trace_on() # T batch predict - Q, _, _ = self.Qmain.model.predict(q_input, batch_size = self.batch_size) + Q, _, _ = self.Qmain.model.predict(q_input, + batch_size = self.batch_size) ## Log graph once and disable graph logging if training_step == 0: @@ -331,8 +346,10 @@ def _batch_train(self, batch, training_step, step): tf.summary.trace_export(self.name + "-graph", step) # T+1 batch predict - Q1, _, _ = self.Qmain.model.predict(q1_input, batch_size = self.batch_size) - Q2, _, _ = self.Qtarget.model.predict(q2_input, batch_size = self.batch_size) + Q1, _, _ = self.Qmain.model.predict(q1_input, + batch_size = self.batch_size) + Q2, _, _ = self.Qtarget.model.predict(q2_input, + batch_size = self.batch_size) # Compute batch Double Q update to Qtarget for i in range(self.batch_size): @@ -343,7 +360,7 @@ def _batch_train(self, batch, training_step, step): d = batch[idx][3] Q[i, a] = r if d == False: - Q[i, a] += DISCOUNT_FACTOR * doubleQ + Q[i, a] += cfg.DISCOUNT_FACTOR * doubleQ # Batch train batch_x = [batch_mem, batch_carry, m_data] @@ -351,7 +368,8 @@ def _batch_train(self, batch, training_step, step): loss = self.Qmain.model.train_on_batch(batch_x, batch_y) loss = loss[0] - print("loss =", loss) + if cfg.VERBOSE: + print("loss =", loss) with self.tf_writer.as_default(): mean_reward = np.mean(self.epoch_rewards) mean_alive = np.mean(self.epoch_alive) diff --git a/l2rpn_baselines/SliceRDQN/SliceRDQN_Config.py b/l2rpn_baselines/SliceRDQN/SliceRDQN_Config.py new file mode 100644 index 0000000..bd61ce5 --- /dev/null +++ b/l2rpn_baselines/SliceRDQN/SliceRDQN_Config.py @@ -0,0 +1,39 @@ +import os +import json + +class SliceRDQN_Config(): + INITIAL_EPSILON = 0.80 + FINAL_EPSILON = 0.01 + DECAY_EPSILON = 1024*256 + STEP_EPSILON = (INITIAL_EPSILON-FINAL_EPSILON)/DECAY_EPSILON + DISCOUNT_FACTOR = 0.99 + REPLAY_BUFFER_SIZE = 1024*8 + UPDATE_FREQ = 512 + UPDATE_TARGET_HARD_FREQ = -1 + UPDATE_TARGET_SOFT_TAU = 0.001 + INPUT_BIAS = 3.0 + SUFFLE_FREQ = 1000 + TRACE_LENGTH = 8 + BATCH_SIZE = 32 + LR = 1e-5 + VERBOSE = True + + @staticmethod + def from_json(json_in_path): + with open(json_in_path, 'r') as fp: + conf_json = json.load(fp) + + for k,v in conf_json.items(): + if hasattr(SliceRDQN_Config, k): + setattr(SliceRDQN_Config, k, v) + + @staticmethod + def to_json(json_out_path): + conf_json = {} + for attr in dir(SliceRDQN_Config): + if attr.startswith('__') or callable(getattr(SliceRDQN_Config, attr)): + continue + conf_json[attr] = getattr(SliceRDQN_Config, attr) + + with open(json_out_path, 'w+') as fp: + json.dump(conf_json, fp, indent=2) diff --git a/l2rpn_baselines/SliceRDQN/SliceRDQN_NN.py b/l2rpn_baselines/SliceRDQN/SliceRDQN_NN.py index 2d0d25e..3c9d832 100644 --- a/l2rpn_baselines/SliceRDQN/SliceRDQN_NN.py +++ b/l2rpn_baselines/SliceRDQN/SliceRDQN_NN.py @@ -243,5 +243,5 @@ def save_network(self, path): def load_network(self, path): # nothing has changed self.model.load_weights(path) - print("Succesfully loaded network from: {}".format(path)) + print("Successfully loaded network from: {}".format(path)) diff --git a/l2rpn_baselines/SliceRDQN/__init__.py b/l2rpn_baselines/SliceRDQN/__init__.py index dc89d89..b6e9fa3 100644 --- 
a/l2rpn_baselines/SliceRDQN/__init__.py +++ b/l2rpn_baselines/SliceRDQN/__init__.py @@ -1,10 +1,12 @@ __all__ = [ "SliceRDQN", + "SliceRDQN_Config", "evaluate", "train" ] from l2rpn_baselines.SliceRDQN.SliceRDQN import SliceRDQN +from l2rpn_baselines.SliceRDQN.SliceRDQN_Config import SliceRDQN_Config from l2rpn_baselines.SliceRDQN.evaluate import evaluate from l2rpn_baselines.SliceRDQN.train import train diff --git a/l2rpn_baselines/SliceRDQN/evaluate.py b/l2rpn_baselines/SliceRDQN/evaluate.py index 5680e75..b977e78 100755 --- a/l2rpn_baselines/SliceRDQN/evaluate.py +++ b/l2rpn_baselines/SliceRDQN/evaluate.py @@ -24,7 +24,7 @@ DEFAULT_NB_EPISODE = 1 DEFAULT_NB_PROCESS = 1 DEFAULT_MAX_STEPS = -1 - +DEFAULT_VERBOSE = True def cli(): parser = argparse.ArgumentParser(description="Eval baseline DDDQN") @@ -57,7 +57,7 @@ def evaluate(env, nb_episode=DEFAULT_NB_EPISODE, nb_process=DEFAULT_NB_PROCESS, max_steps=DEFAULT_MAX_STEPS, - verbose=False, + verbose=DEFAULT_VERBOSE, save_gif=False): # Limit gpu usage @@ -66,7 +66,7 @@ def evaluate(env, tf.config.experimental.set_memory_growth(physical_devices[0], True) runner_params = env.get_params_for_runner() - runner_params["verbose"] = args.verbose + runner_params["verbose"] = verbose # Run # Create agent @@ -83,10 +83,11 @@ def evaluate(env, agentInstance=agent) # Print model summary - stringlist = [] - agent.Qmain.model.summary(print_fn=lambda x: stringlist.append(x)) - short_model_summary = "\n".join(stringlist) - print(short_model_summary) + if verbose: + stringlist = [] + agent.Qmain.model.summary(print_fn=lambda x: stringlist.append(x)) + short_model_summary = "\n".join(stringlist) + print(short_model_summary) # Run os.makedirs(logs_path, exist_ok=True) @@ -94,19 +95,23 @@ def evaluate(env, nb_episode=nb_episode, nb_process=nb_process, max_iter=max_steps, - pbar=True) + pbar=verbose) # Print summary - print("Evaluation summary:") - for _, chron_name, cum_reward, nb_time_step, max_ts in res: - msg_tmp = "chronics at: {}".format(chron_name) - msg_tmp += "\ttotal reward: {:.6f}".format(cum_reward) - msg_tmp += "\ttime steps: {:.0f}/{:.0f}".format(nb_time_step, max_ts) - print(msg_tmp) + if verbose: + print("Evaluation summary:") + for _, chron_name, cum_reward, nb_time_step, max_ts in res: + msg_tmp = "chronics at: {}".format(chron_name) + msg_tmp += "\ttotal reward: {:.6f}".format(cum_reward) + msg_tmp += "\ttime steps: {:.0f}/{:.0f}".format(nb_time_step, + max_ts) + print(msg_tmp) if save_gif: save_log_gif(logs_path, res) + return res + if __name__ == "__main__": # Parse command line diff --git a/l2rpn_baselines/SliceRDQN/train.py b/l2rpn_baselines/SliceRDQN/train.py index 56d136e..a5a8c0d 100755 --- a/l2rpn_baselines/SliceRDQN/train.py +++ b/l2rpn_baselines/SliceRDQN/train.py @@ -17,6 +17,7 @@ from grid2op.Parameters import Parameters from l2rpn_baselines.SliceRDQN.SliceRDQN import SliceRDQN as RDQNAgent +from l2rpn_baselines.SliceRDQN.SliceRDQN_Config import SliceRDQN_Config as RDQNConfig DEFAULT_NAME = "SliceRDQN" DEFAULT_SAVE_DIR = "./models" @@ -26,7 +27,7 @@ DEFAULT_TRACE_LEN = 12 DEFAULT_BATCH_SIZE = 32 DEFAULT_LR = 1e-5 - +DEFAULT_VERBOSE = True def cli(): parser = argparse.ArgumentParser(description="Train baseline DDQN") @@ -34,7 +35,8 @@ def cli(): # Paths parser.add_argument("--name", required=False, default="SliceRDQN_ls", help="The name of the model") - parser.add_argument("--data_dir", required=False, default="l2rpn_case14_sandbox", + parser.add_argument("--data_dir", required=False, + default="l2rpn_case14_sandbox", help="Path to 
the dataset root directory") parser.add_argument("--save_dir", required=False, default=DEFAULT_SAVE_DIR, type=str, @@ -73,7 +75,14 @@ def train(env, num_pre_training_steps=DEFAULT_PRE_STEPS, trace_length=DEFAULT_TRACE_LEN, batch_size=DEFAULT_BATCH_SIZE, - learning_rate=DEFAULT_LR): + learning_rate=DEFAULT_LR, + verbose=DEFAULT_VERBOSE): + + # Set config + RDQNConfig.LR = learning_rate + RDQNConfig.BATCH_SIZE = batch_size + RDQNConfig.TRACE_LENGTH = trace_length + RDQNConfig.VERBOSE = verbose # Limit gpu usage physical_devices = tf.config.list_physical_devices('GPU') @@ -83,10 +92,7 @@ def train(env, agent = RDQNAgent(env.observation_space, env.action_space, name=name, - is_training=True, - batch_size=batch_size, - trace_length=trace_length, - lr=learning_rate) + is_training=True) if load_path is not None: agent.load(load_path) diff --git a/l2rpn_baselines/test/test_train_eval.py b/l2rpn_baselines/test/test_train_eval.py index 4c549bb..eec3f6c 100644 --- a/l2rpn_baselines/test/test_train_eval.py +++ b/l2rpn_baselines/test/test_train_eval.py @@ -16,12 +16,21 @@ from l2rpn_baselines.utils import TrainingParam, NNParam from l2rpn_baselines.DeepQSimple import train as train_dqn from l2rpn_baselines.DeepQSimple import evaluate as eval_dqn -from l2rpn_baselines.DuelQSimple import train as train_d3qn -from l2rpn_baselines.DuelQSimple import evaluate as eval_d3qn +from l2rpn_baselines.DuelQSimple import train as train_d3qs +from l2rpn_baselines.DuelQSimple import evaluate as eval_d3qs from l2rpn_baselines.SAC import train as train_sac from l2rpn_baselines.SAC import evaluate as eval_sac from l2rpn_baselines.DuelQLeapNet import train as train_leap from l2rpn_baselines.DuelQLeapNet import evaluate as eval_leap +from l2rpn_baselines.DoubleDuelingDQN import train as train_d3qn +from l2rpn_baselines.DoubleDuelingDQN import evaluate as eval_d3qn +from l2rpn_baselines.DoubleDuelingDQN import DoubleDuelingDQNConfig as d3qn_cfg +from l2rpn_baselines.DoubleDuelingRDQN import train as train_rqn +from l2rpn_baselines.DoubleDuelingRDQN import evaluate as eval_rqn +from l2rpn_baselines.DoubleDuelingRDQN import DoubleDuelingRDQNConfig as rdqn_cfg +from l2rpn_baselines.SliceRDQN import train as train_srqn +from l2rpn_baselines.SliceRDQN import evaluate as eval_srqn +from l2rpn_baselines.SliceRDQN import SliceRDQN_Config as srdqn_cfg os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' @@ -105,7 +114,7 @@ def test_train_eval(self): "set_topo_vect": False } nm_ = "AnneOnymous" - train_d3qn(env, + train_d3qs(env, name=nm_, iterations=100, save_path=tmp_dir, @@ -117,7 +126,7 @@ def test_train_eval(self): kwargs_converters=kwargs_converters, kwargs_archi=kwargs_archi) - baseline_2 = eval_d3qn(env, + baseline_2 = eval_d3qs(env, name=nm_, load_path=tmp_dir, logs_path=tmp_dir, @@ -248,6 +257,113 @@ def test_train_eval(self): verbose=False, save_gif=False) +class TestD3QN(unittest.TestCase): + def test_train_eval(self): + tmp_dir = tempfile.mkdtemp() + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + env = grid2op.make("rte_case5_example", test=True) + nm_ = "test_D3QN" + + d3qn_cfg.INITIAL_EPSILON = 1.0 + d3qn_cfg.FINAL_EPSILON = 0.01 + d3qn_cfg.DECAY_EPSILON = 20 + d3qn_cfg.UPDATE_FREQ = 16 + + train_d3qn(env, + name=nm_, + iterations=100, + save_path=tmp_dir, + load_path=None, + logs_path=tmp_dir, + learning_rate=1e-4, + verbose=False, + num_pre_training_steps=32, + num_frames=4, + batch_size=8) + + model_path = os.path.join(tmp_dir, nm_ + ".h5") + eval_res = eval_d3qn(env, + load_path=model_path, + 
logs_path=tmp_dir, + nb_episode=1, + nb_process=1, + max_steps=10, + verbose=False, + save_gif=False) + + assert eval_res is not None + +class TestRDQN(unittest.TestCase): + def test_train_eval(self): + tmp_dir = tempfile.mkdtemp() + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + env = grid2op.make("rte_case5_example", test=True) + nm_ = "test_RDQN" + rdqn_cfg.INITIAL_EPSILON = 1.0 + rdqn_cfg.FINAL_EPSILON = 0.01 + rdqn_cfg.DECAY_EPSILON = 20 + rdqn_cfg.UPDATE_FREQ = 16 + + train_rqn(env, + name=nm_, + iterations=100, + save_path=tmp_dir, + load_path=None, + logs_path=tmp_dir, + learning_rate=1e-4, + verbose=False, + num_pre_training_steps=16, + batch_size=8) + + model_path = os.path.join(tmp_dir, nm_ + ".tf") + eval_res = eval_rqn(env, + load_path=model_path, + logs_path=tmp_dir, + nb_episode=1, + nb_process=1, + max_steps=10, + verbose=False, + save_gif=False) + + assert eval_res is not None + +class TestSRDQN(unittest.TestCase): + def test_train_eval(self): + tmp_dir = tempfile.mkdtemp() + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + env = grid2op.make("rte_case5_example", test=True) + nm_ = "test_SRDQN" + srdqn_cfg.INITIAL_EPSILON = 1.0 + srdqn_cfg.FINAL_EPSILON = 0.01 + srdqn_cfg.DECAY_EPSILON = 20 + srdqn_cfg.UPDATE_FREQ = 16 + + train_srqn(env, + name=nm_, + iterations=100, + save_path=tmp_dir, + load_path=None, + logs_path=tmp_dir, + learning_rate=1e-4, + verbose=False, + num_pre_training_steps=32, + batch_size=8) + + model_path = os.path.join(tmp_dir, nm_ + ".tf") + eval_res = eval_srqn(env, + load_path=model_path, + logs_path=tmp_dir, + nb_episode=1, + nb_process=1, + max_steps=10, + verbose=False, + save_gif=False) + + assert eval_res is not None + if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main()
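
A note on the configuration workflow this patch introduces: every tunable constant now lives on a per-baseline Config class, and train()/evaluate() only override the subset they accept as keyword arguments (LR, N_FRAMES, BATCH_SIZE, VERBOSE for DoubleDuelingDQN). The sketch below is a minimal end-to-end example; the paths and iteration count are hypothetical, l2rpn_case14_sandbox is the default dataset of the training scripts, and to_json/from_json are assumed to serialize the class attributes as intended:

    import grid2op
    from l2rpn_baselines.DoubleDuelingDQN import DoubleDuelingDQNConfig, train, evaluate

    # Hyperparameters not exposed as train() keyword arguments are set on the
    # config class before training; train() itself fills LR, N_FRAMES,
    # BATCH_SIZE and VERBOSE from its own arguments.
    DoubleDuelingDQNConfig.INITIAL_EPSILON = 1.0
    DoubleDuelingDQNConfig.FINAL_EPSILON = 0.001
    DoubleDuelingDQNConfig.DECAY_EPSILON = 10000

    env = grid2op.make("l2rpn_case14_sandbox")
    train(env,
          name="d3qn_demo",        # model is saved as <save_path>/<name>.h5
          iterations=1000,
          save_path="./models",    # hypothetical paths
          load_path=None,
          logs_path="./logs-train",
          num_pre_training_steps=256,
          num_frames=4,
          batch_size=32,
          learning_rate=1e-5,
          verbose=True)

    # Keep a record of the exact configuration used for this run.
    DoubleDuelingDQNConfig.to_json("./models/d3qn_demo_conf.json")

    res = evaluate(env,
                   load_path="./models/d3qn_demo.h5",
                   logs_path="./logs-eval",
                   nb_episode=1,
                   nb_process=1,
                   max_steps=-1,
                   verbose=True,
                   save_gif=False)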
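
The feed-forward baseline also switches from the removed linear STEP_EPSILON decay (still used by the recurrent baselines) to the logarithmic schedule in _adaptive_epsilon_decay. A standalone transcription of the same arithmetic, using the DoubleDuelingDQNConfig defaults, makes its shape easy to check:

    import math

    INITIAL_EPSILON = 0.99     # DoubleDuelingDQNConfig.INITIAL_EPSILON
    FINAL_EPSILON = 0.001      # DoubleDuelingDQNConfig.FINAL_EPSILON
    DECAY_EPSILON = 1024 * 64  # DoubleDuelingDQNConfig.DECAY_EPSILON

    def adaptive_epsilon(step):
        # Same arithmetic as DoubleDuelingDQN._adaptive_epsilon_decay
        ada_div = DECAY_EPSILON / 10.0
        step_off = step + ada_div
        ada_eps = INITIAL_EPSILON * -math.log10((step_off + 1) / (DECAY_EPSILON + ada_div))
        return max(FINAL_EPSILON, min(INITIAL_EPSILON, ada_eps))

    # Starts clipped at INITIAL_EPSILON, decays roughly logarithmically, and is
    # clipped to FINAL_EPSILON once step approaches DECAY_EPSILON.
    for s in (0, DECAY_EPSILON // 10, DECAY_EPSILON // 2, DECAY_EPSILON, 2 * DECAY_EPSILON):
        print(s, round(adaptive_epsilon(s), 4))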
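
Two computations recur across these baselines: the dueling aggregation now built inside construct_q_network (the advantage stream is centred before being added to the value stream) and the double-Q target assembled in the per-sample loops of _batch_train. A small NumPy sketch of both, kept separate from the actual Keras graphs:

    import numpy as np

    def dueling_q(value, advantage):
        # Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a)), as wired in construct_q_network
        return value + (advantage - advantage.mean(axis=1, keepdims=True))

    def double_q_target(q_current, q1_next_main, q2_next_target,
                        action, reward, done, gamma=0.98):
        # Same update as the loop in _batch_train: the online network picks the
        # t+1 action, the target network evaluates it. gamma matches
        # DoubleDuelingDQNConfig.DISCOUNT_FACTOR (the recurrent configs use 0.99).
        target = q_current.copy()
        target[action] = reward
        if not done:
            target[action] += gamma * q2_next_target[np.argmax(q1_next_main)]
        return target

    value = np.array([[0.5]])
    advantage = np.array([[1.0, 2.0, 3.0]])
    print(dueling_q(value, advantage))        # [[-0.5  0.5  1.5]]

    q_now = np.zeros(3)
    print(double_q_target(q_now, np.array([0.1, 0.9, 0.2]),
                          np.array([1.0, 2.0, 3.0]),
                          action=0, reward=1.0, done=False))
    # [2.96 0.   0.  ]  since 1.0 + 0.98 * Q2[argmax(Q1)] = 1.0 + 0.98 * 2.0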
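
The training loops call Qmain.update_target_soft(Qtarget.model, tau) and Qmain.update_target_hard(Qtarget.model), but those methods are outside this diff. The sketch below shows the conventional updates they are presumed to implement; this is an assumption about code not shown here, not the baselines' actual implementation:

    def update_target_hard(main_model, target_model):
        # Copy the online weights verbatim into the target network
        # (how often this runs is governed by UPDATE_TARGET_HARD_FREQ).
        target_model.set_weights(main_model.get_weights())

    def update_target_soft(main_model, target_model, tau=1e-3):
        # Polyak averaging: target <- tau * online + (1 - tau) * target,
        # applied at training frequency when UPDATE_TARGET_SOFT_TAU > 0.
        main_w = main_model.get_weights()
        target_w = target_model.get_weights()
        mixed = [tau * mw + (1.0 - tau) * tw for mw, tw in zip(main_w, target_w)]
        target_model.set_weights(mixed)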