From 68654e21ccecdd216e5f9da227de79c3603e8548 Mon Sep 17 00:00:00 2001 From: Tezirg Date: Tue, 19 May 2020 15:49:58 +0200 Subject: [PATCH 01/24] Link to experimental WolperGrid model --- .gitmodules | 3 +++ l2rpn_baselines/WolperGrid | 1 + 2 files changed, 4 insertions(+) create mode 160000 l2rpn_baselines/WolperGrid diff --git a/.gitmodules b/.gitmodules index c5c7102..d86d827 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,3 +4,6 @@ [submodule "l2rpn_baselines/Geirina"] path = l2rpn_baselines/Geirina url = https://github.com/djmax008/GEIRINA_baseline +[submodule "l2rpn_baselines/WolperGrid"] + path = l2rpn_baselines/WolperGrid + url = git@github.com:Tezirg/WolperGrid.git diff --git a/l2rpn_baselines/WolperGrid b/l2rpn_baselines/WolperGrid new file mode 160000 index 0000000..3802a5e --- /dev/null +++ b/l2rpn_baselines/WolperGrid @@ -0,0 +1 @@ +Subproject commit 3802a5e9955f92cc232f3e80e39bb14f257c7bfc From a19c66934db6e4dab8264305db9691ef2a560250 Mon Sep 17 00:00:00 2001 From: Tezirg Date: Tue, 2 Jun 2020 11:44:48 +0200 Subject: [PATCH 02/24] D3QN adds actions filtering --- .../DoubleDuelingDQN/DoubleDuelingDQN.py | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN.py b/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN.py index 59482df..c59399c 100644 --- a/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN.py +++ b/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN.py @@ -44,6 +44,11 @@ def __init__(self, AgentWithConverter.__init__(self, action_space, action_space_converter=IdToAct) self.obs_space = observation_space + + # Filter + print("Actions filtering...") + self.action_space.filter_action(self._filter_action) + print("..Done") # Store constructor params self.name = name @@ -82,6 +87,22 @@ def __init__(self, if self.is_training: self._init_training() + def _filter_action(self, action): + MAX_ELEM = 2 + act_dict = action.impact_on_objects() + elem = 0 + elem += act_dict["force_line"]["reconnections"]["count"] + elem += act_dict["force_line"]["disconnections"]["count"] + elem += act_dict["switch_line"]["count"] + elem += len(act_dict["topology"]["bus_switch"]) + elem += len(act_dict["topology"]["assigned_bus"]) + elem += len(act_dict["topology"]["disconnect_bus"]) + elem += len(act_dict["redispatch"]["generators"]) + + if elem <= MAX_ELEM: + return True + return False + def _init_training(self): self.epsilon = INITIAL_EPSILON self.frames2 = [] From 3f56fab4de2b83641a3a376c5420e40514481e07 Mon Sep 17 00:00:00 2001 From: Tezirg Date: Tue, 2 Jun 2020 11:58:27 +0200 Subject: [PATCH 03/24] Update WolpG head --- l2rpn_baselines/WolperGrid | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/l2rpn_baselines/WolperGrid b/l2rpn_baselines/WolperGrid index 3802a5e..f5820e1 160000 --- a/l2rpn_baselines/WolperGrid +++ b/l2rpn_baselines/WolperGrid @@ -1 +1 @@ -Subproject commit 3802a5e9955f92cc232f3e80e39bb14f257c7bfc +Subproject commit f5820e1678675c8bae9870f8467c524b6cdec248 From ea6b380f03743dd104d73f77e69ef5547f29b9da Mon Sep 17 00:00:00 2001 From: Tezirg Date: Tue, 2 Jun 2020 15:14:16 +0200 Subject: [PATCH 04/24] Update WG implementation to head: Runnable at least --- l2rpn_baselines/WolperGrid | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/l2rpn_baselines/WolperGrid b/l2rpn_baselines/WolperGrid index f5820e1..eae7b06 160000 --- a/l2rpn_baselines/WolperGrid +++ b/l2rpn_baselines/WolperGrid @@ -1 +1 @@ -Subproject commit f5820e1678675c8bae9870f8467c524b6cdec248 +Subproject commit 
eae7b06f89cb4ce283a4f8dc4d283578cdfa13f2 From 1e4d2a19e4dd91a6f662d116b46de8d94b16732c Mon Sep 17 00:00:00 2001 From: Tezirg Date: Thu, 4 Jun 2020 09:46:28 +0200 Subject: [PATCH 05/24] D3QN & RDQN: Do not clip loss --- .../DoubleDuelingDQN/DoubleDuelingDQN.py | 12 ++--- .../DoubleDuelingDQN/DoubleDuelingDQN_NN.py | 48 ++++++++++++------- l2rpn_baselines/DoubleDuelingDQN/train.py | 12 ++--- .../DoubleDuelingRDQN/DoubleDuelingRDQN_NN.py | 21 ++++---- 4 files changed, 54 insertions(+), 39 deletions(-) diff --git a/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN.py b/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN.py index c59399c..9ec8721 100644 --- a/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN.py +++ b/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN.py @@ -17,18 +17,18 @@ from l2rpn_baselines.DoubleDuelingDQN.DoubleDuelingDQN_NN import DoubleDuelingDQN_NN from l2rpn_baselines.DoubleDuelingDQN.prioritized_replay_buffer import PrioritizedReplayBuffer -LR_DECAY_STEPS = 1024*32 +LR_DECAY_STEPS = 1024*64 LR_DECAY_RATE = 0.95 INITIAL_EPSILON = 0.99 FINAL_EPSILON = 0.001 -DECAY_EPSILON = 1024*32 -DISCOUNT_FACTOR = 0.99 +DECAY_EPSILON = 1024*64 +DISCOUNT_FACTOR = 0.98 PER_CAPACITY = 1024*64 PER_ALPHA = 0.7 PER_BETA = 0.5 -UPDATE_FREQ = 64 -UPDATE_TARGET_HARD_FREQ = 16 -UPDATE_TARGET_SOFT_TAU = -1 +UPDATE_FREQ = 28 +UPDATE_TARGET_HARD_FREQ = -1 +UPDATE_TARGET_SOFT_TAU = 1e-3 class DoubleDuelingDQN(AgentWithConverter): diff --git a/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN_NN.py b/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN_NN.py index 2d98343..16701eb 100644 --- a/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN_NN.py +++ b/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN_NN.py @@ -36,32 +36,43 @@ def __init__(self, self.construct_q_network() def construct_q_network(self): - input_layer = tfk.Input(shape = (self.observation_size * self.num_frames,), name="input_obs") + input_shape = (self.observation_size * self.num_frames,) + input_layer = tfk.Input(shape = input_shape, name="input_obs") lay1 = tfkl.Dense(self.observation_size * 2, name="fc_1")(input_layer) lay1 = tfka.relu(lay1, alpha=0.01) #leaky_relu lay2 = tfkl.Dense(self.observation_size, name="fc_2")(lay1) lay2 = tfka.relu(lay2, alpha=0.01) #leaky_relu - lay3 = tfkl.Dense(self.action_size * 3, name="fc_3")(lay2) + lay3 = tfkl.Dense(self.action_size * 2 + 512, name="fc_3")(lay2) lay3 = tfka.relu(lay3, alpha=0.01) #leaky_relu - advantage = tfkl.Dense(self.action_size * 2, name="fc_adv")(lay3) - advantage = tfkl.Dense(self.action_size, name="adv")(advantage) + lay4 = tfkl.Dense(self.action_size * 2 + 256, name="fc_4")(lay3) + lay4 = tfka.relu(lay4, alpha=0.01) #leaky_relu - value = tfkl.Dense(self.action_size * 2, name="fc_val")(lay3) + advantage = tfkl.Dense(self.action_size * 2, name="fc_adv")(lay4) + advantage = tfka.relu(advantage, alpha=0.01) #leaky_relu + advantage = tfkl.Dense(self.action_size, name="adv")(advantage) + advantage_mean = tf.math.reduce_mean(advantage, + axis=1, keepdims=True, + name="adv_mean") + advantage = tfkl.subtract([advantage, advantage_mean], + name="adv_subtract") + + value = tfkl.Dense(self.action_size * 2, name="fc_val")(lay4) + value = tfka.relu(value, alpha=0.01) #leaky_relu value = tfkl.Dense(1, name="val")(value) - advantage_mean = tf.math.reduce_mean(advantage, axis=1, keepdims=True, name="adv_mean") - advantage = tfkl.subtract([advantage, advantage_mean], name="adv_subtract") Q = tf.math.add(value, advantage, name="Qout") self.model = tfk.Model(inputs=[input_layer], outputs=[Q], 
name=self.__class__.__name__) # Backwards pass - self.schedule = tfko.schedules.InverseTimeDecay(self.lr, self.lr_decay_steps, self.lr_decay_rate) - self.optimizer = tfko.Adam(learning_rate=self.schedule) + self.schedule = tfko.schedules.InverseTimeDecay(self.lr, + self.lr_decay_steps, + self.lr_decay_rate) + self.optimizer = tfko.Adam(learning_rate=self.schedule, clipnorm=1.0) def train_on_batch(self, x, y_true, sample_weight): with tf.GradientTape() as tape: @@ -69,10 +80,11 @@ def train_on_batch(self, x, y_true, sample_weight): y_pred = self.model(x) # Compute loss for each sample in the batch - batch_loss = self._clipped_batch_loss(y_true, y_pred) + batch_loss = self._batch_loss(y_true, y_pred) # Apply samples weights - tf_sample_weight = tf.convert_to_tensor(sample_weight, dtype=tf.float32) + tf_sample_weight = tf.convert_to_tensor(sample_weight, + dtype=tf.float32) batch_loss = tf.math.multiply(batch_loss, tf_sample_weight) # Compute mean scalar loss @@ -80,23 +92,27 @@ def train_on_batch(self, x, y_true, sample_weight): # Compute gradients grads = tape.gradient(loss, self.model.trainable_variables) + # Apply gradients - self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables)) + grad_pairs = zip(grads, self.model.trainable_variables) + self.optimizer.apply_gradients(grad_pairs) # Store LR self.train_lr = self.optimizer._decayed_lr('float32').numpy() # Return loss scalar return loss.numpy() - def _clipped_batch_loss(self, y_true, y_pred): + def _batch_loss(self, y_true, y_pred): sq_error = tf.math.square(y_true - y_pred, name="sq_error") - # We store it because that's the priorities vector for importance update - batch_sq_error = tf.math.reduce_sum(sq_error, axis=1, name="batch_sq_error") + # We store it because that's the priorities vector + # for importance update + batch_sq_error = tf.math.reduce_sum(sq_error, axis=1, + name="batch_sq_error") # Stored as numpy array since we are in eager mode self.batch_sq_error = batch_sq_error.numpy() - return tf.clip_by_value(batch_sq_error, 0.0, 1e3, name="batch_sq_error_clip") + return batch_sq_error def random_move(self): opt_policy = np.random.randint(0, self.action_size) diff --git a/l2rpn_baselines/DoubleDuelingDQN/train.py b/l2rpn_baselines/DoubleDuelingDQN/train.py index 9d2ff74..8085016 100755 --- a/l2rpn_baselines/DoubleDuelingDQN/train.py +++ b/l2rpn_baselines/DoubleDuelingDQN/train.py @@ -99,13 +99,9 @@ def train(env, import sys args = cli() - # Use custom params - params = Parameters() - params.MAX_SUB_CHANGED = 2 # Create grid2op game environement env = make(args.data_dir, - param=params, action_class=TopologyChangeAndDispatchAction, reward_class=CombinedScaledReward) @@ -114,13 +110,13 @@ def train(env, # Register custom reward for training cr = env.reward_helper.template_reward - cr.addReward("overflow", CloseToOverflowReward(), 50.0) - cr.addReward("game", GameplayReward(), 200.0) - cr.addReward("recolines", LinesReconnectedReward(), 50.0) + cr.addReward("overflow", CloseToOverflowReward(), 1.0) + cr.addReward("game", GameplayReward(), 2.0) + cr.addReward("recolines", LinesReconnectedReward(), 1.0) # Initialize custom rewards cr.initialize(env) # Set reward range to something managable - cr.set_range(-10.0, 10.0) + cr.set_range(-1.0, 1.0) train(env, name = args.name, diff --git a/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQN_NN.py b/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQN_NN.py index 860b470..367265e 100644 --- a/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQN_NN.py +++ 
b/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQN_NN.py @@ -69,33 +69,36 @@ def construct_q_network(self): advantage = tfkl.Dense(64, name="fc_adv")(lstm_output) advantage = tf.nn.leaky_relu(advantage, alpha=0.01, name="leak_adv") advantage = tfkl.Dense(self.action_size, name="adv")(advantage) + advantage_mean = tf.math.reduce_mean(advantage, axis=1, + keepdims=True, name="adv_mean") + advantage = tfkl.subtract([advantage, advantage_mean], name="adv_sub") value = tfkl.Dense(64, name="fc_val")(lstm_output) value = tf.nn.leaky_relu(value, alpha=0.01, name="leak_val") value = tfkl.Dense(1, name="val")(value) - advantage_mean = tf.math.reduce_mean(advantage, axis=1, keepdims=True, name="adv_mean") - advantage = tfkl.subtract([advantage, advantage_mean], name="adv_sub") Q = tf.math.add(value, advantage, name="Qout") # Backwards pass - self.model = tfk.Model(inputs=[input_mem_state, input_carry_state, input_layer], - outputs=[Q, mem_s, carry_s], + model_inputs = [input_mem_state, input_carry_state, input_layer] + model_outputs = [Q, mem_s, carry_s] + self.model = tfk.Model(inputs=model_inputs, + outputs=model_outputs, name=self.__class__.__name__) losses = [ - self._clipped_mse_loss, + self._mse_loss, self._no_loss, self._no_loss ] - self.model.compile(loss=losses, optimizer=tfko.Adam(lr=self.lr, clipnorm=1.0)) + self.optimizer = tfko.Adam(lr=self.lr, clipnorm=1.0) + self.model.compile(loss=losses, optimizer=self.optimizer) def _no_loss(self, y_true, y_pred): return 0.0 - def _clipped_mse_loss(self, Qnext, Q): + def _mse_loss(self, Qnext, Q): loss = tf.math.reduce_mean(tf.math.square(Qnext - Q), name="loss_mse") - clipped_loss = tf.clip_by_value(loss, 0.0, 1e3, name="loss_clip") - return clipped_loss + return loss def bayesian_move(self, data, mem, carry, rate = 0.0): self.dropout_rate.assign(float(rate)) From 58366b1a2146815c5597d1c186e9fec6100aafd3 Mon Sep 17 00:00:00 2001 From: Tezirg Date: Thu, 4 Jun 2020 09:46:54 +0200 Subject: [PATCH 06/24] Update WG head --- l2rpn_baselines/WolperGrid | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/l2rpn_baselines/WolperGrid b/l2rpn_baselines/WolperGrid index eae7b06..01b698f 160000 --- a/l2rpn_baselines/WolperGrid +++ b/l2rpn_baselines/WolperGrid @@ -1 +1 @@ -Subproject commit eae7b06f89cb4ce283a4f8dc4d283578cdfa13f2 +Subproject commit 01b698f32ebac2f1fce2a91eeee5af97ff9c9a0a From 26d08b85d7505c573ff76009ee3da90ddd4dd89d Mon Sep 17 00:00:00 2001 From: Tezirg Date: Thu, 4 Jun 2020 13:49:40 +0200 Subject: [PATCH 07/24] D3QN: Do not filter action space for now. 
Use very small soft update --- .../DoubleDuelingDQN/DoubleDuelingDQN.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN.py b/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN.py index 9ec8721..b32b84e 100644 --- a/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN.py +++ b/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN.py @@ -22,14 +22,13 @@ INITIAL_EPSILON = 0.99 FINAL_EPSILON = 0.001 DECAY_EPSILON = 1024*64 -DISCOUNT_FACTOR = 0.98 +DISCOUNT_FACTOR = 0.99 PER_CAPACITY = 1024*64 PER_ALPHA = 0.7 PER_BETA = 0.5 UPDATE_FREQ = 28 UPDATE_TARGET_HARD_FREQ = -1 -UPDATE_TARGET_SOFT_TAU = 1e-3 - +UPDATE_TARGET_SOFT_TAU = 1e-4 class DoubleDuelingDQN(AgentWithConverter): def __init__(self, @@ -46,9 +45,9 @@ def __init__(self, self.obs_space = observation_space # Filter - print("Actions filtering...") - self.action_space.filter_action(self._filter_action) - print("..Done") + #print("Actions filtering...") + #self.action_space.filter_action(self._filter_action) + #print("..Done") # Store constructor params self.name = name @@ -234,17 +233,12 @@ def train(self, env, modelpath = os.path.join(save_path, self.name + ".h5") self.tf_writer = tf.summary.create_file_writer(logpath, name=self.name) self._save_hyperparameters(save_path, env, num_steps) - + # Training loop while step < num_steps: # Init first time or new episode if self.done: new_obs = env.reset() # This shouldn't raise - # Random fast forward somewhere in the day - #ff_rand = np.random.randint(0, 12*24) - #env.fast_forward_chronics(ff_rand) - # Reset internal state - #new_obs = env.current_obs self.reset(new_obs) if step % 1000 == 0: print("Step [{}] -- Random [{}]".format(step, self.epsilon)) From 0ac92a84db945f84dd7dc2ab312dce92769d9a5f Mon Sep 17 00:00:00 2001 From: Tezirg Date: Tue, 9 Jun 2020 19:20:08 +0200 Subject: [PATCH 08/24] D3QN: Exposes hypers as D3QNConfig class attributes --- .../DoubleDuelingDQN/DoubleDuelingDQN.py | 87 ++++++++----------- .../DoubleDuelingDQNConfig.py | 39 +++++++++ .../DoubleDuelingDQN/DoubleDuelingDQN_NN.py | 8 +- l2rpn_baselines/DoubleDuelingDQN/__init__.py | 2 + l2rpn_baselines/DoubleDuelingDQN/train.py | 26 +++--- 5 files changed, 98 insertions(+), 64 deletions(-) create mode 100644 l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQNConfig.py diff --git a/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN.py b/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN.py index b32b84e..ba19d13 100644 --- a/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN.py +++ b/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN.py @@ -14,31 +14,17 @@ from grid2op.Agent import AgentWithConverter from grid2op.Converter import IdToAct +from l2rpn_baselines.DoubleDuelingDQN.DoubleDuelingDQNConfig import DoubleDuelingDQNConfig as cfg + from l2rpn_baselines.DoubleDuelingDQN.DoubleDuelingDQN_NN import DoubleDuelingDQN_NN from l2rpn_baselines.DoubleDuelingDQN.prioritized_replay_buffer import PrioritizedReplayBuffer -LR_DECAY_STEPS = 1024*64 -LR_DECAY_RATE = 0.95 -INITIAL_EPSILON = 0.99 -FINAL_EPSILON = 0.001 -DECAY_EPSILON = 1024*64 -DISCOUNT_FACTOR = 0.99 -PER_CAPACITY = 1024*64 -PER_ALPHA = 0.7 -PER_BETA = 0.5 -UPDATE_FREQ = 28 -UPDATE_TARGET_HARD_FREQ = -1 -UPDATE_TARGET_SOFT_TAU = 1e-4 - class DoubleDuelingDQN(AgentWithConverter): def __init__(self, observation_space, action_space, name=__name__, - num_frames=4, - is_training=False, - batch_size=32, - lr=1e-5): + is_training=False): # Call parent constructor AgentWithConverter.__init__(self, 
action_space, action_space_converter=IdToAct) @@ -46,15 +32,15 @@ def __init__(self, # Filter #print("Actions filtering...") - #self.action_space.filter_action(self._filter_action) + self.action_space.filter_action(self._filter_action) #print("..Done") # Store constructor params self.name = name - self.num_frames = num_frames + self.num_frames = cfg.N_FRAMES self.is_training = is_training - self.batch_size = batch_size - self.lr = lr + self.batch_size = cfg.BATCH_SIZE + self.lr = cfg.LR # Declare required vars self.Qmain = None @@ -80,8 +66,8 @@ def __init__(self, self.observation_size, num_frames=self.num_frames, learning_rate=self.lr, - learning_rate_decay_steps=LR_DECAY_STEPS, - learning_rate_decay_rate=LR_DECAY_RATE) + learning_rate_decay_steps=cfg.LR_DECAY_STEPS, + learning_rate_decay_rate=cfg.LR_DECAY_RATE) # Setup training vars if needed if self.is_training: self._init_training() @@ -103,11 +89,11 @@ def _filter_action(self, action): return False def _init_training(self): - self.epsilon = INITIAL_EPSILON + self.epsilon = cfg.INITIAL_EPSILON self.frames2 = [] self.epoch_rewards = [] self.epoch_alive = [] - self.per_buffer = PrioritizedReplayBuffer(PER_CAPACITY, PER_ALPHA) + self.per_buffer = PrioritizedReplayBuffer(cfg.PER_CAPACITY, cfg.PER_ALPHA) self.Qtarget = DoubleDuelingDQN_NN(self.action_size, self.observation_size, num_frames = self.num_frames) @@ -135,32 +121,32 @@ def _save_next_frame(self, next_state): self.frames2.pop(0) def _adaptive_epsilon_decay(self, step): - ada_div = DECAY_EPSILON / 10.0 + ada_div = cfg.DECAY_EPSILON / 10.0 step_off = step + ada_div - ada_eps = INITIAL_EPSILON * -math.log10((step_off + 1) / (DECAY_EPSILON + ada_div)) - ada_eps_up_clip = min(INITIAL_EPSILON, ada_eps) - ada_eps_low_clip = max(FINAL_EPSILON, ada_eps_up_clip) + ada_eps = cfg.INITIAL_EPSILON * -math.log10((step_off + 1) / (cfg.DECAY_EPSILON + ada_div)) + ada_eps_up_clip = min(cfg.INITIAL_EPSILON, ada_eps) + ada_eps_low_clip = max(cfg.FINAL_EPSILON, ada_eps_up_clip) return ada_eps_low_clip def _save_hyperparameters(self, logpath, env, steps): r_instance = env.reward_helper.template_reward hp = { "lr": self.lr, - "lr_decay_steps": LR_DECAY_STEPS, - "lr_decay_rate": LR_DECAY_RATE, + "lr_decay_steps": cfg.LR_DECAY_STEPS, + "lr_decay_rate": cfg.LR_DECAY_RATE, "batch_size": self.batch_size, "stack_frames": self.num_frames, "iter": steps, - "e_start": INITIAL_EPSILON, - "e_end": FINAL_EPSILON, - "e_decay": DECAY_EPSILON, - "discount": DISCOUNT_FACTOR, - "per_alpha": PER_ALPHA, - "per_beta": PER_BETA, - "per_capacity": PER_CAPACITY, - "update_freq": UPDATE_FREQ, - "update_hard": UPDATE_TARGET_HARD_FREQ, - "update_soft": UPDATE_TARGET_SOFT_TAU, + "e_start": cfg.INITIAL_EPSILON, + "e_end": cfg.FINAL_EPSILON, + "e_decay": cfg.DECAY_EPSILON, + "discount": cfg.DISCOUNT_FACTOR, + "per_alpha": cfg.PER_ALPHA, + "per_beta": cfg.PER_BETA, + "per_capacity": cfg.PER_CAPACITY, + "update_freq": cfg.UPDATE_FREQ, + "update_hard": cfg.UPDATE_TARGET_HARD_FREQ, + "update_soft": cfg.UPDATE_TARGET_SOFT_TAU, "reward": dict(r_instance) } hp_filename = "{}-hypers.json".format(self.name) @@ -222,7 +208,7 @@ def train(self, env, num_training_steps = iterations num_steps = num_pre_training_steps + num_training_steps step = 0 - self.epsilon = INITIAL_EPSILON + self.epsilon = cfg.INITIAL_EPSILON alive_steps = 0 total_reward = 0 self.done = True @@ -282,16 +268,19 @@ def train(self, env, self.epsilon = self._adaptive_epsilon_decay(training_step) # Perform training at given frequency - if step % UPDATE_FREQ == 0 and 
len(self.per_buffer) >= self.batch_size: + if step % cfg.UPDATE_FREQ == 0 and \ + len(self.per_buffer) >= self.batch_size: # Perform training self._batch_train(training_step, step) - if UPDATE_TARGET_SOFT_TAU > 0.0: + if cfg.UPDATE_TARGET_SOFT_TAU > 0.0: + tau = cfg.UPDATE_TARGET_SOFT_TAU # Update target network towards primary network - self.Qmain.update_target_soft(self.Qtarget.model, tau=UPDATE_TARGET_SOFT_TAU) + self.Qmain.update_target_soft(self.Qtarget.model, tau) # Every UPDATE_TARGET_HARD_FREQ trainings, update target completely - if UPDATE_TARGET_HARD_FREQ > 0 and step % (UPDATE_FREQ * UPDATE_TARGET_HARD_FREQ) == 0: + if cfg.UPDATE_TARGET_HARD_FREQ > 0 and \ + step % (cfg.UPDATE_FREQ * cfg.UPDATE_TARGET_HARD_FREQ) == 0: self.Qmain.update_target_hard(self.Qtarget.model) total_reward += reward @@ -322,7 +311,7 @@ def _batch_train(self, training_step, step): """Trains network to fit given parameters""" # Sample from experience buffer - sample_batch = self.per_buffer.sample(self.batch_size, PER_BETA) + sample_batch = self.per_buffer.sample(self.batch_size, cfg.PER_BETA) s_batch = sample_batch[0] a_batch = sample_batch[1] r_batch = sample_batch[2] @@ -359,7 +348,7 @@ def _batch_train(self, training_step, step): doubleQ = Q2[i, np.argmax(Q1[i])] Q[i, a_batch[i]] = r_batch[i] if d_batch[i] == False: - Q[i, a_batch[i]] += DISCOUNT_FACTOR * doubleQ + Q[i, a_batch[i]] += cfg.DISCOUNT_FACTOR * doubleQ # Batch train loss = self.Qmain.train_on_batch(input_t, Q, w_batch) @@ -371,7 +360,7 @@ def _batch_train(self, training_step, step): self.per_buffer.update_priorities(idx_batch, priorities) # Log some useful metrics every even updates - if step % (UPDATE_FREQ * 2) == 0: + if step % (cfg.UPDATE_FREQ * 2) == 0: with self.tf_writer.as_default(): mean_reward = np.mean(self.epoch_rewards) mean_alive = np.mean(self.epoch_alive) diff --git a/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQNConfig.py b/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQNConfig.py new file mode 100644 index 0000000..35164c2 --- /dev/null +++ b/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQNConfig.py @@ -0,0 +1,39 @@ +import os +import json + +class DoubleDuelingDQNConfig(): + LR_DECAY_STEPS = 1024*64 + LR_DECAY_RATE = 0.95 + INITIAL_EPSILON = 0.99 + FINAL_EPSILON = 0.001 + DECAY_EPSILON = 1024*64 + DISCOUNT_FACTOR = 0.98 + PER_CAPACITY = 1024*64 + PER_ALPHA = 0.7 + PER_BETA = 0.5 + UPDATE_FREQ = 28 + UPDATE_TARGET_HARD_FREQ = -1 + UPDATE_TARGET_SOFT_TAU = 1e-3 + N_FRAMES = 4 + BATCH_SIZE = 32 + LR = 1e-5 + + @staticmethod + def from_json(json_in_path): + with open(json_in_path, 'r') as fp: + conf_json = json.load(fp) + + for k,v in conf_json.items(): + if hasattr(DoubleDuelingDQNConfig, k): + setattr(DoubleDuelingDQNConfig, k, v) + + @staticmethod + def to_json(json_out_path): + conf_json = {} + for attr in dir(DoubleDuelingDQNConfig): + if attr.startswith('__') or callable(attr): + continue + conf_json[attr] = getattr(DoubleDuelingDQNConfig, attr) + + with open(json_out_path, 'w+') as fp: + json.dump(fp, conf_json, indent=2) diff --git a/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN_NN.py b/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN_NN.py index 16701eb..c33db0c 100644 --- a/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN_NN.py +++ b/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN_NN.py @@ -44,13 +44,13 @@ def construct_q_network(self): lay2 = tfkl.Dense(self.observation_size, name="fc_2")(lay1) lay2 = tfka.relu(lay2, alpha=0.01) #leaky_relu - lay3 = tfkl.Dense(self.action_size * 2 + 512, 
name="fc_3")(lay2) + lay3 = tfkl.Dense(896, name="fc_3")(lay2) lay3 = tfka.relu(lay3, alpha=0.01) #leaky_relu - lay4 = tfkl.Dense(self.action_size * 2 + 256, name="fc_4")(lay3) + lay4 = tfkl.Dense(512, name="fc_4")(lay3) lay4 = tfka.relu(lay4, alpha=0.01) #leaky_relu - advantage = tfkl.Dense(self.action_size * 2, name="fc_adv")(lay4) + advantage = tfkl.Dense(384, name="fc_adv")(lay4) advantage = tfka.relu(advantage, alpha=0.01) #leaky_relu advantage = tfkl.Dense(self.action_size, name="adv")(advantage) advantage_mean = tf.math.reduce_mean(advantage, @@ -59,7 +59,7 @@ def construct_q_network(self): advantage = tfkl.subtract([advantage, advantage_mean], name="adv_subtract") - value = tfkl.Dense(self.action_size * 2, name="fc_val")(lay4) + value = tfkl.Dense(384, name="fc_val")(lay4) value = tfka.relu(value, alpha=0.01) #leaky_relu value = tfkl.Dense(1, name="val")(value) diff --git a/l2rpn_baselines/DoubleDuelingDQN/__init__.py b/l2rpn_baselines/DoubleDuelingDQN/__init__.py index f5ced0d..d309b42 100644 --- a/l2rpn_baselines/DoubleDuelingDQN/__init__.py +++ b/l2rpn_baselines/DoubleDuelingDQN/__init__.py @@ -1,9 +1,11 @@ __all__ = [ "DoubleDuelingDQN", + "DoubleDuelingDQNConfig", "evaluate", "train" ] from l2rpn_baselines.DoubleDuelingDQN.DoubleDuelingDQN import DoubleDuelingDQN +from l2rpn_baselines.DoubleDuelingDQN.DoubleDuelingDQNConfig import DoubleDuelingDQNConfig from l2rpn_baselines.DoubleDuelingDQN.evaluate import evaluate from l2rpn_baselines.DoubleDuelingDQN.train import train diff --git a/l2rpn_baselines/DoubleDuelingDQN/train.py b/l2rpn_baselines/DoubleDuelingDQN/train.py index 8085016..6d5a4de 100755 --- a/l2rpn_baselines/DoubleDuelingDQN/train.py +++ b/l2rpn_baselines/DoubleDuelingDQN/train.py @@ -11,7 +11,8 @@ import argparse import tensorflow as tf -from l2rpn_baselines.DoubleDuelingDQN.DoubleDuelingDQN import DoubleDuelingDQN as DDDQNAgent +from l2rpn_baselines.DoubleDuelingDQN.DoubleDuelingDQN import DoubleDuelingDQN as D3QNAgent +from l2rpn_baselines.DoubleDuelingDQN.DoubleDuelingDQNConfig import DoubleDuelingDQNConfig as D3QNConfig DEFAULT_NAME = "DoubleDuelingDQN" DEFAULT_SAVE_DIR = "./models" @@ -68,18 +69,20 @@ def train(env, batch_size= DEFAULT_BATCH_SIZE, learning_rate= DEFAULT_LR): + # Set config + D3QNConfig.LR = learning_rate + D3QNConfig.N_FRAMES = num_frames + D3QNConfig.BATCH_SIZE = batch_size + # Limit gpu usage physical_devices = tf.config.list_physical_devices('GPU') if len(physical_devices) > 0: tf.config.experimental.set_memory_growth(physical_devices[0], True) - agent = DDDQNAgent(env.observation_space, - env.action_space, - name=name, - is_training=True, - batch_size=batch_size, - num_frames=num_frames, - lr=learning_rate) + agent = D3QNAgent(env.observation_space, + env.action_space, + name=name, + is_training=True) if load_path is not None: agent.load(load_path) @@ -110,9 +113,10 @@ def train(env, # Register custom reward for training cr = env.reward_helper.template_reward - cr.addReward("overflow", CloseToOverflowReward(), 1.0) - cr.addReward("game", GameplayReward(), 2.0) - cr.addReward("recolines", LinesReconnectedReward(), 1.0) + #cr.addReward("overflow", CloseToOverflowReward(), 1.0) + cr.addReward("game", GameplayReward(), 1.0) + #cr.addReward("recolines", LinesReconnectedReward(), 1.0) + cr.addReward("l2rpn", L2RPNReward(), 2.0/float(env.n_line)) # Initialize custom rewards cr.initialize(env) # Set reward range to something managable From 6876502729531a651efb09c99130d61c9a54dd53 Mon Sep 17 00:00:00 2001 From: Tezirg Date: Tue, 9 Jun 2020 19:20:28 
+0200 Subject: [PATCH 09/24] WolperGrid: Update Head --- l2rpn_baselines/WolperGrid | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/l2rpn_baselines/WolperGrid b/l2rpn_baselines/WolperGrid index 01b698f..e7c400a 160000 --- a/l2rpn_baselines/WolperGrid +++ b/l2rpn_baselines/WolperGrid @@ -1 +1 @@ -Subproject commit 01b698f32ebac2f1fce2a91eeee5af97ff9c9a0a +Subproject commit e7c400a521ab45e9e1a4ee0c503b46eb748226eb From 1b7de8a37befb9f04a30e1a0f7f2f02199db42a7 Mon Sep 17 00:00:00 2001 From: Tezirg Date: Tue, 9 Jun 2020 19:21:24 +0200 Subject: [PATCH 10/24] D3QN indent --- l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN.py | 1 - 1 file changed, 1 deletion(-) diff --git a/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN.py b/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN.py index ba19d13..2fa4260 100644 --- a/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN.py +++ b/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN.py @@ -15,7 +15,6 @@ from grid2op.Converter import IdToAct from l2rpn_baselines.DoubleDuelingDQN.DoubleDuelingDQNConfig import DoubleDuelingDQNConfig as cfg - from l2rpn_baselines.DoubleDuelingDQN.DoubleDuelingDQN_NN import DoubleDuelingDQN_NN from l2rpn_baselines.DoubleDuelingDQN.prioritized_replay_buffer import PrioritizedReplayBuffer From 9f7f7d0d23380fc966bf467e670a0e5beb98c205 Mon Sep 17 00:00:00 2001 From: Tezirg Date: Wed, 10 Jun 2020 09:34:20 +0200 Subject: [PATCH 11/24] D3QN update eval with configurable hypers --- .../DoubleDuelingDQN/DoubleDuelingDQN.py | 6 +++--- l2rpn_baselines/DoubleDuelingDQN/evaluate.py | 13 ++++++++----- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN.py b/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN.py index 2fa4260..782b96c 100644 --- a/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN.py +++ b/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN.py @@ -130,11 +130,11 @@ def _adaptive_epsilon_decay(self, step): def _save_hyperparameters(self, logpath, env, steps): r_instance = env.reward_helper.template_reward hp = { - "lr": self.lr, + "lr": cfg.LR, "lr_decay_steps": cfg.LR_DECAY_STEPS, "lr_decay_rate": cfg.LR_DECAY_RATE, - "batch_size": self.batch_size, - "stack_frames": self.num_frames, + "batch_size": cfg.BATCH_SIZE, + "stack_frames": cfg.N_FRAMES, "iter": steps, "e_start": cfg.INITIAL_EPSILON, "e_end": cfg.FINAL_EPSILON, diff --git a/l2rpn_baselines/DoubleDuelingDQN/evaluate.py b/l2rpn_baselines/DoubleDuelingDQN/evaluate.py index fc9409e..ab357a0 100755 --- a/l2rpn_baselines/DoubleDuelingDQN/evaluate.py +++ b/l2rpn_baselines/DoubleDuelingDQN/evaluate.py @@ -17,7 +17,8 @@ from grid2op.Reward import * from grid2op.Action import * -from l2rpn_baselines.DoubleDuelingDQN.DoubleDuelingDQN import DoubleDuelingDQN as DDDQNAgent +from l2rpn_baselines.DoubleDuelingDQN.DoubleDuelingDQN import DoubleDuelingDQN as D3QNAgent +from l2rpn_baselines.DoubleDuelingDQN.DoubleDuelingDQNConfig import DoubleDuelingDQNConfig as D3QNConfig from l2rpn_baselines.utils.save_log_gif import save_log_gif DEFAULT_LOGS_DIR = "./logs-evals" @@ -65,6 +66,9 @@ def evaluate(env, verbose=False, save_gif=False): + # Set config + D3QNConfig.N_FRAMES = num_frames + # Limit gpu usage physical_devices = tf.config.list_physical_devices('GPU') tf.config.experimental.set_memory_growth(physical_devices[0], True) @@ -73,10 +77,9 @@ def evaluate(env, runner_params["verbose"] = args.verbose # Create agent - agent = DDDQNAgent(env.observation_space, - env.action_space, - is_training=False, - 
num_frames=num_frames) + agent = D3QNAgent(env.observation_space, + env.action_space, + is_training=False) # Load weights from file agent.load(load_path) From 3238af4f584011d0a587c1b57955714144e9430e Mon Sep 17 00:00:00 2001 From: Tezirg Date: Wed, 10 Jun 2020 10:02:24 +0200 Subject: [PATCH 12/24] RDQN: Adds configurable hyperparameters --- .../DoubleDuelingRDQN/DoubleDuelingRDQN.py | 122 ++++++++++-------- l2rpn_baselines/DoubleDuelingRDQN/__init__.py | 2 + l2rpn_baselines/DoubleDuelingRDQN/evaluate.py | 2 +- l2rpn_baselines/DoubleDuelingRDQN/train.py | 11 +- 4 files changed, 80 insertions(+), 57 deletions(-) diff --git a/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQN.py b/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQN.py index 1affec6..899e85b 100644 --- a/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQN.py +++ b/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQN.py @@ -16,28 +16,16 @@ from grid2op.Agent import AgentWithConverter from grid2op.Converter import IdToAct +from l2rpn_baselines.DoubleDuelingRDQN.DoubleDuelingRDQNConfig import DoubleDuelingRDQNConfig as cfg from l2rpn_baselines.DoubleDuelingRDQN.ExperienceBuffer import ExperienceBuffer from l2rpn_baselines.DoubleDuelingRDQN.DoubleDuelingRDQN_NN import DoubleDuelingRDQN_NN -INITIAL_EPSILON = 0.99 -FINAL_EPSILON = 0.01 -DECAY_EPSILON = 1024*32 -STEP_EPSILON = (INITIAL_EPSILON-FINAL_EPSILON)/DECAY_EPSILON -DISCOUNT_FACTOR = 0.99 -REPLAY_BUFFER_SIZE = 1024*4 -UPDATE_FREQ = 64 -UPDATE_TARGET_HARD_FREQ = -1 -UPDATE_TARGET_SOFT_TAU = 0.001 - class DoubleDuelingRDQN(AgentWithConverter): def __init__(self, observation_space, action_space, name=__name__, - trace_length=1, - batch_size=1, - is_training=False, - lr=1e-5): + is_training=False): # Call parent constructor AgentWithConverter.__init__(self, action_space, action_space_converter=IdToAct) @@ -45,10 +33,10 @@ def __init__(self, # Store constructor params self.observation_space = observation_space self.name = name - self.trace_length = trace_length - self.batch_size = batch_size + self.trace_length = cfg.TRACE_LENGTH + self.batch_size = cfg.BATCH_SIZE self.is_training = is_training - self.lr = lr + self.lr = cfg.LR # Declare required vars self.Qmain = None @@ -78,7 +66,9 @@ def __init__(self, def _init_training(self): - self.exp_buffer = ExperienceBuffer(REPLAY_BUFFER_SIZE, self.batch_size, self.trace_length) + self.exp_buffer = ExperienceBuffer(cfg.REPLAY_BUFFER_SIZE, + self.batch_size, + self.trace_length) self.done = True self.epoch_rewards = [] self.epoch_alive = [] @@ -110,17 +100,17 @@ def _register_experience(self, episode_exp, episode): def _save_hyperparameters(self, logpath, env, steps): r_instance = env.reward_helper.template_reward hp = { - "lr": self.lr, - "batch_size": self.batch_size, - "trace_len": self.trace_length, - "e_start": INITIAL_EPSILON, - "e_end": FINAL_EPSILON, - "e_decay": DECAY_EPSILON, - "discount": DISCOUNT_FACTOR, - "buffer_size": REPLAY_BUFFER_SIZE, - "update_freq": UPDATE_FREQ, - "update_hard": UPDATE_TARGET_HARD_FREQ, - "update_soft": UPDATE_TARGET_SOFT_TAU, + "lr": cfg.LR, + "batch_size": cfg.BATCH_SIZE, + "trace_len": cfg.TRACE_LENGTH, + "e_start": cfg.INITIAL_EPSILON, + "e_end": cfg.FINAL_EPSILON, + "e_decay": cfg.DECAY_EPSILON, + "discount": cfg.DISCOUNT_FACTOR, + "buffer_size": cfg.REPLAY_BUFFER_SIZE, + "update_freq": cfg.UPDATE_FREQ, + "update_hard": cfg.UPDATE_TARGET_HARD_FREQ, + "update_soft": cfg.UPDATE_TARGET_SOFT_TAU, "reward": dict(r_instance) } hp_filename = "{}-hypers.json".format(self.name) @@ -153,7 +143,9 @@ def 
reset(self, observation): def my_act(self, state, reward, done=False): data_input = np.array(state) data_input.reshape(1, 1, self.observation_size) - a, _, m, c = self.Qmain.predict_move(data_input, self.mem_state, self.carry_state) + a, _, m, c = self.Qmain.predict_move(data_input, + self.mem_state, + self.carry_state) self.mem_state = m self.carry_state = c @@ -178,7 +170,7 @@ def train(self, env, num_training_steps = iterations num_steps = num_pre_training_steps + num_training_steps step = 0 - epsilon = INITIAL_EPSILON + epsilon = cfg.INITIAL_EPSILON alive_steps = 0 total_reward = 0 episode = 0 @@ -209,12 +201,19 @@ def train(self, env, # Choose an action if step <= num_pre_training_steps: - a, m, c = self.Qmain.random_move(self.state, self.mem_state, self.carry_state) + a, m, c = self.Qmain.random_move(self.state, + self.mem_state, + self.carry_state) elif len(episode_exp) < self.trace_length: - a, m, c = self.Qmain.random_move(self.state, self.mem_state, self.carry_state) + a, m, c = self.Qmain.random_move(self.state, + self.mem_state, + self.carry_state) a = 0 # Do Nothing else: - a, _, m, c = self.Qmain.bayesian_move(self.state, self.mem_state, self.carry_state, epsilon) + a, _, m, c = self.Qmain.bayesian_move(self.state, + self.mem_state, + self.carry_state, + epsilon) # Update LSTM state self.mem_state = m @@ -233,23 +232,27 @@ def train(self, env, if step >= num_pre_training_steps: training_step = step - num_pre_training_steps # Slowly decay dropout rate - if epsilon > FINAL_EPSILON: - epsilon -= STEP_EPSILON - if epsilon < FINAL_EPSILON: - epsilon = FINAL_EPSILON + if epsilon > cfg.FINAL_EPSILON: + epsilon -= cfg.STEP_EPSILON + if epsilon < cfg.FINAL_EPSILON: + epsilon = cfg.FINAL_EPSILON # Perform training at given frequency - if step % UPDATE_FREQ == 0 and self.exp_buffer.can_sample(): + if step % cfg.UPDATE_FREQ == 0 and \ + self.exp_buffer.can_sample(): # Sample from experience buffer batch = self.exp_buffer.sample() # Perform training self._batch_train(batch, step, training_step) # Update target network towards primary network - if UPDATE_TARGET_SOFT_TAU > 0: - self.Qmain.update_target_soft(self.Qtarget.model, tau=UPDATE_TARGET_SOFT_TAU) - - # Every UPDATE_TARGET_HARD_FREQ trainings, update target completely - if UPDATE_TARGET_HARD_FREQ > 0 and step % (UPDATE_FREQ * UPDATE_TARGET_HARD_FREQ) == 0: + if cfg.UPDATE_TARGET_SOFT_TAU > 0: + tau = cfg.UPDATE_TARGET_SOFT_TAU + self.Qmain.update_target_soft(self.Qtarget.model, tau) + + # Every UPDATE_TARGET_HARD_FREQ trainings, + # update target completely + if cfg.UPDATE_TARGET_HARD_FREQ > 0 and \ + step % (cfg.UPDATE_FREQ * cfg.UPDATE_TARGET_HARD_FREQ) == 0: self.Qmain.update_target_hard(self.Qtarget.model) total_reward += reward @@ -286,9 +289,21 @@ def _batch_train(self, batch, step, training_step): m_data = m_data.reshape(self.batch_size, self.trace_length, input_size) t_data = np.vstack(batch[:, 4]) t_data = t_data.reshape(self.batch_size, self.trace_length, input_size) - q_input = [copy.deepcopy(batch_mem), copy.deepcopy(batch_carry), copy.deepcopy(m_data)] - q1_input = [copy.deepcopy(batch_mem), copy.deepcopy(batch_carry), copy.deepcopy(t_data)] - q2_input = [copy.deepcopy(batch_mem), copy.deepcopy(batch_carry), copy.deepcopy(t_data)] + q_input = [ + copy.deepcopy(batch_mem), + copy.deepcopy(batch_carry), + copy.deepcopy(m_data) + ] + q1_input = [ + copy.deepcopy(batch_mem), + copy.deepcopy(batch_carry), + copy.deepcopy(t_data) + ] + q2_input = [ + copy.deepcopy(batch_mem), + copy.deepcopy(batch_carry), + 
copy.deepcopy(t_data) + ] # Batch predict self.Qmain.trace_length.assign(self.trace_length) @@ -301,7 +316,8 @@ def _batch_train(self, batch, step, training_step): tf.summary.trace_on() # T Batch predict - Q, _, _ = self.Qmain.model.predict(q_input, batch_size = self.batch_size) + Q, _, _ = self.Qmain.model.predict(q_input, + batch_size = self.batch_size) ## Log graph once and disable graph logging if training_step == 0: @@ -309,8 +325,10 @@ def _batch_train(self, batch, step, training_step): tf.summary.trace_export(self.name + "-graph", step) # T+1 batch predict - Q1, _, _ = self.Qmain.model.predict(q1_input, batch_size=self.batch_size) - Q2, _, _ = self.Qtarget.model.predict(q2_input, batch_size=self.batch_size) + Q1, _, _ = self.Qmain.model.predict(q1_input, + batch_size=self.batch_size) + Q2, _, _ = self.Qtarget.model.predict(q2_input, + batch_size=self.batch_size) # Compute batch Double Q update to Qtarget for i in range(self.batch_size): @@ -321,7 +339,7 @@ def _batch_train(self, batch, step, training_step): d = batch[idx][3] Q[i, a] = r if d == False: - Q[i, a] += DISCOUNT_FACTOR * doubleQ + Q[i, a] += cfg.DISCOUNT_FACTOR * doubleQ # Batch train batch_x = [batch_mem, batch_carry, m_data] @@ -330,7 +348,7 @@ def _batch_train(self, batch, step, training_step): loss = loss[0] # Log some useful metrics - if step % (UPDATE_FREQ * 2) == 0: + if step % (cfg.UPDATE_FREQ * 2) == 0: print("loss =", loss) with self.tf_writer.as_default(): mean_reward = np.mean(self.epoch_rewards) diff --git a/l2rpn_baselines/DoubleDuelingRDQN/__init__.py b/l2rpn_baselines/DoubleDuelingRDQN/__init__.py index bf1698c..085c33f 100644 --- a/l2rpn_baselines/DoubleDuelingRDQN/__init__.py +++ b/l2rpn_baselines/DoubleDuelingRDQN/__init__.py @@ -1,10 +1,12 @@ __all__ = [ "DoubleDuelingRDQN", + "DoubleDuelingRDQNConfig", "evaluate", "train" ] from l2rpn_baselines.DoubleDuelingRDQN.DoubleDuelingRDQN import DoubleDuelingRDQN +from l2rpn_baselines.DoubleDuelingRDQN.DoubleDuelingRDQNConfig import DoubleDuelingRDQNConfig from l2rpn_baselines.DoubleDuelingRDQN.evaluate import evaluate from l2rpn_baselines.DoubleDuelingRDQN.train import train diff --git a/l2rpn_baselines/DoubleDuelingRDQN/evaluate.py b/l2rpn_baselines/DoubleDuelingRDQN/evaluate.py index 105006f..1b07372 100755 --- a/l2rpn_baselines/DoubleDuelingRDQN/evaluate.py +++ b/l2rpn_baselines/DoubleDuelingRDQN/evaluate.py @@ -17,6 +17,7 @@ from grid2op.Reward import * from grid2op.Action import * +from l2rpn_baselines.DoubleDuelingRDQN.DoubleDuelingRDQNConfig import DoubleDuelingRDQNConfig as RDQNConfig from l2rpn_baselines.DoubleDuelingRDQN.DoubleDuelingRDQN import DoubleDuelingRDQN as RDQNAgent from l2rpn_baselines.utils.save_log_gif import save_log_gif @@ -25,7 +26,6 @@ DEFAULT_NB_PROCESS = 1 DEFAULT_MAX_STEPS = -1 - def cli(): parser = argparse.ArgumentParser(description="Eval baseline DDDQN") parser.add_argument("--data_dir", required=True, diff --git a/l2rpn_baselines/DoubleDuelingRDQN/train.py b/l2rpn_baselines/DoubleDuelingRDQN/train.py index 1696129..582a466 100755 --- a/l2rpn_baselines/DoubleDuelingRDQN/train.py +++ b/l2rpn_baselines/DoubleDuelingRDQN/train.py @@ -15,6 +15,7 @@ from grid2op.Reward import * from grid2op.Action import * +from l2rpn_baselines.DoubleDuelingRDQN.DoubleDuelingRDQNConfig import DoubleDuelingRDQNConfig as RDQNConfig from l2rpn_baselines.DoubleDuelingRDQN.DoubleDuelingRDQN import DoubleDuelingRDQN as RDQNAgent DEFAULT_NAME = "DoubleDuelingRDQN" @@ -74,6 +75,11 @@ def train(env, batch_size=DEFAULT_BATCH_SIZE, learning_rate=DEFAULT_LR): 
+ # Set config + RDQNConfig.TRACE_LENGTH = trace_length + RDQNConfig.BATCH_SIZE = batch_size + RDQNConfig.LR = learning_rate + # Limit gpu usage physical_devices = tf.config.list_physical_devices('GPU') if len(physical_devices) > 0: @@ -82,10 +88,7 @@ def train(env, agent = RDQNAgent(env.observation_space, env.action_space, name=name, - is_training=True, - batch_size=batch_size, - trace_length=trace_length, - lr=learning_rate) + is_training=True) if load_path is not None: agent.load(load_path) From 81419e7d03a82c5742e823ecdb67761c3b877465 Mon Sep 17 00:00:00 2001 From: Tezirg Date: Wed, 10 Jun 2020 10:02:48 +0200 Subject: [PATCH 13/24] RDQN: Adds config class --- .../DoubleDuelingRDQNConfig.py | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQNConfig.py diff --git a/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQNConfig.py b/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQNConfig.py new file mode 100644 index 0000000..108a488 --- /dev/null +++ b/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQNConfig.py @@ -0,0 +1,36 @@ +import os +import json + +class DoubleDuelingRDQNConfig(): + INITIAL_EPSILON = 0.99 + FINAL_EPSILON = 0.01 + DECAY_EPSILON = 1024*32 + STEP_EPSILON = (INITIAL_EPSILON-FINAL_EPSILON)/DECAY_EPSILON + DISCOUNT_FACTOR = 0.99 + REPLAY_BUFFER_SIZE = 1024*4 + UPDATE_FREQ = 64 + UPDATE_TARGET_HARD_FREQ = -1 + UPDATE_TARGET_SOFT_TAU = 0.001 + TRACE_LENGTH = 8 + BATCH_SIZE = 32 + LR = 1e-5 + + @staticmethod + def from_json(json_in_path): + with open(json_in_path, 'r') as fp: + conf_json = json.load(fp) + + for k,v in conf_json.items(): + if hasattr(DoubleDuelingDQNConfig, k): + setattr(DoubleDuelingDQNConfig, k, v) + + @staticmethod + def to_json(json_out_path): + conf_json = {} + for attr in dir(DoubleDuelingDQNConfig): + if attr.startswith('__') or callable(attr): + continue + conf_json[attr] = getattr(DoubleDuelingDQNConfig, attr) + + with open(json_out_path, 'w+') as fp: + json.dump(fp, conf_json, indent=2) From 33dbb976ebe57e44de6b793e77b431644334b493 Mon Sep 17 00:00:00 2001 From: Tezirg Date: Thu, 11 Jun 2020 10:45:18 +0200 Subject: [PATCH 14/24] Update WolperGrid head --- l2rpn_baselines/WolperGrid | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/l2rpn_baselines/WolperGrid b/l2rpn_baselines/WolperGrid index e7c400a..2d782ee 160000 --- a/l2rpn_baselines/WolperGrid +++ b/l2rpn_baselines/WolperGrid @@ -1 +1 @@ -Subproject commit e7c400a521ab45e9e1a4ee0c503b46eb748226eb +Subproject commit 2d782eef2417c07db35fa13552fd4d5bc0418d37 From ba3baf2f0b57650a199f879cc867aae6718889c8 Mon Sep 17 00:00:00 2001 From: Tezirg Date: Thu, 11 Jun 2020 10:56:45 +0200 Subject: [PATCH 15/24] SRDQN: Configurable hyperparams --- l2rpn_baselines/SliceRDQN/SliceRDQN.py | 126 ++++++++++-------- l2rpn_baselines/SliceRDQN/SliceRDQN_Config.py | 38 ++++++ l2rpn_baselines/SliceRDQN/__init__.py | 2 + l2rpn_baselines/SliceRDQN/train.py | 14 +- 4 files changed, 120 insertions(+), 60 deletions(-) create mode 100644 l2rpn_baselines/SliceRDQN/SliceRDQN_Config.py diff --git a/l2rpn_baselines/SliceRDQN/SliceRDQN.py b/l2rpn_baselines/SliceRDQN/SliceRDQN.py index 3d3fdc4..ccc2e24 100644 --- a/l2rpn_baselines/SliceRDQN/SliceRDQN.py +++ b/l2rpn_baselines/SliceRDQN/SliceRDQN.py @@ -17,30 +17,17 @@ from grid2op.Converter import IdToAct from l2rpn_baselines.SliceRDQN.ExperienceBuffer import ExperienceBuffer +from l2rpn_baselines.SliceRDQN.SliceRDQN_Config import SliceRDQN_Config as cfg from 
l2rpn_baselines.SliceRDQN.SliceRDQN_NN import SliceRDQN_NN from l2rpn_baselines.SliceRDQN.slice_util import * -INITIAL_EPSILON = 0.80 -FINAL_EPSILON = 0.01 -DECAY_EPSILON = 1024*256 -STEP_EPSILON = (INITIAL_EPSILON-FINAL_EPSILON)/DECAY_EPSILON -DISCOUNT_FACTOR = 0.99 -REPLAY_BUFFER_SIZE = 1024*8 -UPDATE_FREQ = 512 -UPDATE_TARGET_HARD_FREQ = -1 -UPDATE_TARGET_SOFT_TAU = 0.001 -INPUT_BIAS = 3.0 -SUFFLE_FREQ = 1000 class SliceRDQN(AgentWithConverter): def __init__(self, observation_space, action_space, name=__name__, - trace_length=1, - batch_size=1, - is_training=False, - lr=1e-5): + is_training=False): # Call parent constructor AgentWithConverter.__init__(self, action_space, action_space_converter=IdToAct) @@ -48,10 +35,10 @@ def __init__(self, # Store constructor params self.observation_space = observation_space self.name = name - self.trace_length = trace_length - self.batch_size = batch_size + self.trace_length = cfg.TRACE_LENGTH + self.batch_size = cfg.BATCH_SIZE self.is_training = is_training - self.lr = lr + self.lr = cfg.LR # Declare required vars self.Qmain = None @@ -66,7 +53,7 @@ def __init__(self, self.epoch_rewards = None self.epoch_alive = None self.Qtarget = None - self.epsilon = INITIAL_EPSILON + self.epsilon = cfg.INITIAL_EPSILON # Compute dimensions from intial state self.action_size = self.action_space.n @@ -100,7 +87,9 @@ def __init__(self, def _init_training(self): - self.exp_buffer = ExperienceBuffer(REPLAY_BUFFER_SIZE, self.batch_size, self.trace_length) + self.exp_buffer = ExperienceBuffer(cfg.REPLAY_BUFFER_SIZE, + self.batch_size, + self.trace_length) self.done = True self.epoch_rewards = [] self.epoch_alive = [] @@ -136,15 +125,15 @@ def _save_hyperparameters(self, logpath, env, steps): "lr": self.lr, "batch_size": self.batch_size, "trace_len": self.trace_length, - "e_start": INITIAL_EPSILON, - "e_end": FINAL_EPSILON, - "e_decay": DECAY_EPSILON, - "discount": DISCOUNT_FACTOR, - "buffer_size": REPLAY_BUFFER_SIZE, - "update_freq": UPDATE_FREQ, - "update_hard": UPDATE_TARGET_HARD_FREQ, - "update_soft": UPDATE_TARGET_SOFT_TAU, - "input_bias": INPUT_BIAS, + "e_start": cfg.INITIAL_EPSILON, + "e_end": cfg.FINAL_EPSILON, + "e_decay": cfg.DECAY_EPSILON, + "discount": cfg.DISCOUNT_FACTOR, + "buffer_size": cfg.REPLAY_BUFFER_SIZE, + "update_freq": cfg.UPDATE_FREQ, + "update_hard": cfg.UPDATE_TARGET_HARD_FREQ, + "update_soft": cfg.UPDATE_TARGET_SOFT_TAU, + "input_bias": cfg.INPUT_BIAS, "reward": dict(r_instance) } hp_filename = "{}-hypers.json".format(self.name) @@ -154,7 +143,7 @@ def _save_hyperparameters(self, logpath, env, steps): ## Agent Interface def convert_obs(self, observation): - return convert_obs_pad(observation, bias=INPUT_BIAS) + return convert_obs_pad(observation, bias=cfg.INPUT_BIAS) def convert_act(self, action): return super().convert_act(action) @@ -191,7 +180,7 @@ def train(self, env, num_training_steps = iterations num_steps = num_pre_training_steps + num_training_steps step = 0 - self.epsilon = INITIAL_EPSILON + self.epsilon = cfg.INITIAL_EPSILON alive_steps = 0 total_reward = 0 episode = 0 @@ -209,10 +198,16 @@ def train(self, env, while step < num_steps: # New episode if self.done: - if episode % SUFFLE_FREQ == 0: + if episode % cfg.SUFFLE_FREQ == 0: # shuffle the data every now and then - env.chronics_handler.shuffle( - shuffler=lambda x: x[np.random.choice(len(x), size=len(x), replace=False)]) + def shuff(x): + s = np.random.choice(len(x), + size=len(x), + replace=False) + return x[s] + + env.chronics_handler.shuffle(shuffler=shuff) + new_obs = 
env.reset() # This shouldn't raise self.reset(new_obs) # Push current episode experience to experience buffer @@ -221,7 +216,7 @@ def train(self, env, episode += 1 episode_exp = [] - if step % SUFFLE_FREQ == 0: + if step % cfg.SUFFLE_FREQ == 0: print("Step [{}] -- Dropout [{}]".format(step, self.epsilon)) # Choose an action @@ -252,26 +247,27 @@ def train(self, env, if step >= num_pre_training_steps: training_step = step - num_pre_training_steps # Slowly decay dropout rate - if self.epsilon > FINAL_EPSILON: - self.epsilon -= STEP_EPSILON - if self.epsilon < FINAL_EPSILON: - self.epsilon = FINAL_EPSILON + if self.epsilon > cfg.FINAL_EPSILON: + self.epsilon -= cfg.STEP_EPSILON + if self.epsilon < cfg.FINAL_EPSILON: + self.epsilon = cfg.FINAL_EPSILON # Perform training at given frequency - if step % UPDATE_FREQ == 0 and self.exp_buffer.can_sample(): + if step % cfg.UPDATE_FREQ == 0 and \ + self.exp_buffer.can_sample(): # Sample from experience buffer batch = self.exp_buffer.sample() # Perform training self._batch_train(batch, training_step, step) # Update target network towards primary network - if UPDATE_TARGET_SOFT_TAU > 0: - tau = UPDATE_TARGET_SOFT_TAU + if cfg.UPDATE_TARGET_SOFT_TAU > 0: + tau = cfg.UPDATE_TARGET_SOFT_TAU self.Qmain.update_target_soft(self.Qtarget.model, tau) # Every UPDATE_TARGET_HARD_FREQ trainings # update target completely - if UPDATE_TARGET_HARD_FREQ > 0 and \ - step % (UPDATE_FREQ * UPDATE_TARGET_HARD_FREQ) == 0: + if cfg.UPDATE_TARGET_HARD_FREQ > 0 and \ + step % (cfg.UPDATE_FREQ * cfg.UPDATE_TARGET_HARD_FREQ) == 0: self.Qmain.update_target_hard(self.Qtarget.model) total_reward += reward @@ -300,17 +296,34 @@ def train(self, env, def _batch_train(self, batch, training_step, step): """Trains network to fit given parameters""" Q = np.zeros((self.batch_size, self.action_size)) - batch_mem = np.zeros((self.batch_size, self.n_slices, self.Qmain.h_size)) - batch_carry = np.zeros((self.batch_size, self.n_slices, self.Qmain.h_size)) - - input_shape = (self.batch_size, self.trace_length) + self.observation_shape + batch_mem = np.zeros((self.batch_size, + self.n_slices, + self.Qmain.h_size)) + batch_carry = np.zeros((self.batch_size, + self.n_slices, + self.Qmain.h_size)) + + input_shape = (self.batch_size, + self.trace_length) + self.observation_shape m_data = np.vstack(batch[:, 0]) m_data = m_data.reshape(input_shape) t_data = np.vstack(batch[:, 4]) t_data = t_data.reshape(input_shape) - q_input = [copy.deepcopy(batch_mem), copy.deepcopy(batch_carry), copy.deepcopy(m_data)] - q1_input = [copy.deepcopy(batch_mem), copy.deepcopy(batch_carry), copy.deepcopy(t_data)] - q2_input = [copy.deepcopy(batch_mem), copy.deepcopy(batch_carry), copy.deepcopy(t_data)] + q_input = [ + copy.deepcopy(batch_mem), + copy.deepcopy(batch_carry), + copy.deepcopy(m_data) + ] + q1_input = [ + copy.deepcopy(batch_mem), + copy.deepcopy(batch_carry), + copy.deepcopy(t_data) + ] + q2_input = [ + copy.deepcopy(batch_mem), + copy.deepcopy(batch_carry), + copy.deepcopy(t_data) + ] # Batch predict self.Qmain.trace_length.assign(self.trace_length) @@ -323,7 +336,8 @@ def _batch_train(self, batch, training_step, step): tf.summary.trace_on() # T batch predict - Q, _, _ = self.Qmain.model.predict(q_input, batch_size = self.batch_size) + Q, _, _ = self.Qmain.model.predict(q_input, + batch_size = self.batch_size) ## Log graph once and disable graph logging if training_step == 0: @@ -331,8 +345,10 @@ def _batch_train(self, batch, training_step, step): tf.summary.trace_export(self.name + "-graph", step) # T+1 
batch predict - Q1, _, _ = self.Qmain.model.predict(q1_input, batch_size = self.batch_size) - Q2, _, _ = self.Qtarget.model.predict(q2_input, batch_size = self.batch_size) + Q1, _, _ = self.Qmain.model.predict(q1_input, + batch_size = self.batch_size) + Q2, _, _ = self.Qtarget.model.predict(q2_input, + batch_size = self.batch_size) # Compute batch Double Q update to Qtarget for i in range(self.batch_size): @@ -343,7 +359,7 @@ def _batch_train(self, batch, training_step, step): d = batch[idx][3] Q[i, a] = r if d == False: - Q[i, a] += DISCOUNT_FACTOR * doubleQ + Q[i, a] += cfg.DISCOUNT_FACTOR * doubleQ # Batch train batch_x = [batch_mem, batch_carry, m_data] diff --git a/l2rpn_baselines/SliceRDQN/SliceRDQN_Config.py b/l2rpn_baselines/SliceRDQN/SliceRDQN_Config.py new file mode 100644 index 0000000..16ffffa --- /dev/null +++ b/l2rpn_baselines/SliceRDQN/SliceRDQN_Config.py @@ -0,0 +1,38 @@ +import os +import json + +class SliceRDQN_Config(): + INITIAL_EPSILON = 0.80 + FINAL_EPSILON = 0.01 + DECAY_EPSILON = 1024*256 + STEP_EPSILON = (INITIAL_EPSILON-FINAL_EPSILON)/DECAY_EPSILON + DISCOUNT_FACTOR = 0.99 + REPLAY_BUFFER_SIZE = 1024*8 + UPDATE_FREQ = 512 + UPDATE_TARGET_HARD_FREQ = -1 + UPDATE_TARGET_SOFT_TAU = 0.001 + INPUT_BIAS = 3.0 + SUFFLE_FREQ = 1000 + TRACE_LENGTH = 8 + BATCH_SIZE = 32 + LR = 1e-5 + + @staticmethod + def from_json(json_in_path): + with open(json_in_path, 'r') as fp: + conf_json = json.load(fp) + + for k,v in conf_json.items(): + if hasattr(DoubleDuelingDQNConfig, k): + setattr(DoubleDuelingDQNConfig, k, v) + + @staticmethod + def to_json(json_out_path): + conf_json = {} + for attr in dir(DoubleDuelingDQNConfig): + if attr.startswith('__') or callable(attr): + continue + conf_json[attr] = getattr(DoubleDuelingDQNConfig, attr) + + with open(json_out_path, 'w+') as fp: + json.dump(fp, conf_json, indent=2) diff --git a/l2rpn_baselines/SliceRDQN/__init__.py b/l2rpn_baselines/SliceRDQN/__init__.py index dc89d89..b6e9fa3 100644 --- a/l2rpn_baselines/SliceRDQN/__init__.py +++ b/l2rpn_baselines/SliceRDQN/__init__.py @@ -1,10 +1,12 @@ __all__ = [ "SliceRDQN", + "SliceRDQN_Config", "evaluate", "train" ] from l2rpn_baselines.SliceRDQN.SliceRDQN import SliceRDQN +from l2rpn_baselines.SliceRDQN.SliceRDQN_Config import SliceRDQN_Config from l2rpn_baselines.SliceRDQN.evaluate import evaluate from l2rpn_baselines.SliceRDQN.train import train diff --git a/l2rpn_baselines/SliceRDQN/train.py b/l2rpn_baselines/SliceRDQN/train.py index 56d136e..75f7c14 100755 --- a/l2rpn_baselines/SliceRDQN/train.py +++ b/l2rpn_baselines/SliceRDQN/train.py @@ -17,6 +17,7 @@ from grid2op.Parameters import Parameters from l2rpn_baselines.SliceRDQN.SliceRDQN import SliceRDQN as RDQNAgent +from l2rpn_baselines.SliceRDQN.SliceRDQN_Config import SliceRDQN_Config as RDQNConfig DEFAULT_NAME = "SliceRDQN" DEFAULT_SAVE_DIR = "./models" @@ -34,7 +35,8 @@ def cli(): # Paths parser.add_argument("--name", required=False, default="SliceRDQN_ls", help="The name of the model") - parser.add_argument("--data_dir", required=False, default="l2rpn_case14_sandbox", + parser.add_argument("--data_dir", required=False, + default="l2rpn_case14_sandbox", help="Path to the dataset root directory") parser.add_argument("--save_dir", required=False, default=DEFAULT_SAVE_DIR, type=str, @@ -75,6 +77,11 @@ def train(env, batch_size=DEFAULT_BATCH_SIZE, learning_rate=DEFAULT_LR): + # Set config + RDQNConfig.LR = learning_rate + RDQNConfig.BATCH_SIZE = batch_size + RDQNConfig.TRACE_LENGTH = trace_length + # Limit gpu usage physical_devices = 
tf.config.list_physical_devices('GPU') if len(physical_devices) > 0: @@ -83,10 +90,7 @@ def train(env, agent = RDQNAgent(env.observation_space, env.action_space, name=name, - is_training=True, - batch_size=batch_size, - trace_length=trace_length, - lr=learning_rate) + is_training=True) if load_path is not None: agent.load(load_path) From e0ab11f4349640ab8fe7d37108a4dc0e01c676ee Mon Sep 17 00:00:00 2001 From: Tezirg Date: Thu, 11 Jun 2020 11:55:28 +0200 Subject: [PATCH 16/24] Adds verbosity control and test for D3QN, SRDQN & RDQN --- .../DoubleDuelingDQN/DoubleDuelingDQN.py | 14 +- .../DoubleDuelingDQNConfig.py | 1 + .../DoubleDuelingDQN/DoubleDuelingDQN_NN.py | 2 +- l2rpn_baselines/DoubleDuelingDQN/evaluate.py | 36 ++--- l2rpn_baselines/DoubleDuelingDQN/train.py | 8 +- .../DoubleDuelingRDQN/DoubleDuelingRDQN.py | 11 +- .../DoubleDuelingRDQNConfig.py | 1 + .../DoubleDuelingRDQN/DoubleDuelingRDQN_NN.py | 2 +- l2rpn_baselines/DoubleDuelingRDQN/evaluate.py | 31 +++-- l2rpn_baselines/DoubleDuelingRDQN/train.py | 6 +- l2rpn_baselines/SliceRDQN/SliceRDQN.py | 10 +- l2rpn_baselines/SliceRDQN/SliceRDQN_Config.py | 1 + l2rpn_baselines/SliceRDQN/SliceRDQN_NN.py | 2 +- l2rpn_baselines/SliceRDQN/evaluate.py | 33 +++-- l2rpn_baselines/SliceRDQN/train.py | 6 +- l2rpn_baselines/test/test_train_eval.py | 126 +++++++++++++++++- 16 files changed, 219 insertions(+), 71 deletions(-) diff --git a/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN.py b/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN.py index 782b96c..316649c 100644 --- a/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN.py +++ b/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN.py @@ -225,7 +225,7 @@ def train(self, env, if self.done: new_obs = env.reset() # This shouldn't raise self.reset(new_obs) - if step % 1000 == 0: + if cfg.VERBOSE and step % 1000 == 0: print("Step [{}] -- Random [{}]".format(step, self.epsilon)) # Save current observation to stacking buffer @@ -248,7 +248,8 @@ def train(self, env, new_state = self.convert_obs(new_obs) if info["is_illegal"] or info["is_ambiguous"] or \ info["is_dispatching_illegal"] or info["is_illegal_reco"]: - print (a, info) + if cfg.VERBOSE: + print (a, info) # Save new observation to stacking buffer self._save_next_frame(new_state) @@ -286,8 +287,9 @@ def train(self, env, if self.done: self.epoch_rewards.append(total_reward) self.epoch_alive.append(alive_steps) - print("Survived [{}] steps".format(alive_steps)) - print("Total reward [{}]".format(total_reward)) + if cfg.VERBOSE: + print("Survived [{}] steps".format(alive_steps)) + print("Total reward [{}]".format(total_reward)) alive_steps = 0 total_reward = 0 else: @@ -375,5 +377,5 @@ def _batch_train(self, training_step, step): tf.summary.scalar("mean_alive_100", mean_alive_100, step) tf.summary.scalar("loss", loss, step) tf.summary.scalar("lr", self.Qmain.train_lr, step) - - print("loss =", loss) + if cfg.VERBOSE: + print("loss =", loss) diff --git a/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQNConfig.py b/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQNConfig.py index 35164c2..2f3c7a0 100644 --- a/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQNConfig.py +++ b/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQNConfig.py @@ -17,6 +17,7 @@ class DoubleDuelingDQNConfig(): N_FRAMES = 4 BATCH_SIZE = 32 LR = 1e-5 + VERBOSE = True @staticmethod def from_json(json_in_path): diff --git a/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN_NN.py b/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN_NN.py index c33db0c..685c5fe 100644 --- 
a/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN_NN.py +++ b/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN_NN.py @@ -151,5 +151,5 @@ def save_network(self, path): def load_network(self, path): # Load from a model.h5 file self.model.load_weights(path) - print("Succesfully loaded network from: {}".format(path)) + print("Successfully loaded network from: {}".format(path)) diff --git a/l2rpn_baselines/DoubleDuelingDQN/evaluate.py b/l2rpn_baselines/DoubleDuelingDQN/evaluate.py index ab357a0..2bffa07 100755 --- a/l2rpn_baselines/DoubleDuelingDQN/evaluate.py +++ b/l2rpn_baselines/DoubleDuelingDQN/evaluate.py @@ -26,7 +26,7 @@ DEFAULT_NB_PROCESS = 1 DEFAULT_MAX_STEPS = -1 DEFAULT_NUM_FRAMES = 4 - +DEFAULT_VERBOSE = True def cli(): parser = argparse.ArgumentParser(description="Eval baseline DDDQN") @@ -63,18 +63,19 @@ def evaluate(env, nb_process=DEFAULT_NB_PROCESS, max_steps=DEFAULT_MAX_STEPS, num_frames=DEFAULT_NUM_FRAMES, - verbose=False, + verbose=DEFAULT_VERBOSE, save_gif=False): # Set config D3QNConfig.N_FRAMES = num_frames - + D3QNConfig.VERBOSE = verbose + # Limit gpu usage physical_devices = tf.config.list_physical_devices('GPU') tf.config.experimental.set_memory_growth(physical_devices[0], True) runner_params = env.get_params_for_runner() - runner_params["verbose"] = args.verbose + runner_params["verbose"] = verbose # Create agent agent = D3QNAgent(env.observation_space, @@ -90,10 +91,11 @@ def evaluate(env, agentInstance=agent) # Print model summary - stringlist = [] - agent.Qmain.model.summary(print_fn=lambda x: stringlist.append(x)) - short_model_summary = "\n".join(stringlist) - print(short_model_summary) + if verbose: + stringlist = [] + agent.Qmain.model.summary(print_fn=lambda x: stringlist.append(x)) + short_model_summary = "\n".join(stringlist) + print(short_model_summary) # Run os.makedirs(logs_path, exist_ok=True) @@ -101,19 +103,23 @@ def evaluate(env, nb_episode=nb_episode, nb_process=nb_process, max_iter=max_steps, - pbar=True) + pbar=verbose) # Print summary - print("Evaluation summary:") - for _, chron_name, cum_reward, nb_time_step, max_ts in res: - msg_tmp = "chronics at: {}".format(chron_name) - msg_tmp += "\ttotal reward: {:.6f}".format(cum_reward) - msg_tmp += "\ttime steps: {:.0f}/{:.0f}".format(nb_time_step, max_ts) - print(msg_tmp) + if verbose: + print("Evaluation summary:") + for _, chron_name, cum_reward, nb_time_step, max_ts in res: + msg_tmp = "chronics at: {}".format(chron_name) + msg_tmp += "\ttotal reward: {:.6f}".format(cum_reward) + msg_tmp += "\ttime steps: {:.0f}/{:.0f}".format(nb_time_step, + max_ts) + print(msg_tmp) if save_gif: save_log_gif(logs_path, res) + return res + if __name__ == "__main__": # Parse command line args = cli() diff --git a/l2rpn_baselines/DoubleDuelingDQN/train.py b/l2rpn_baselines/DoubleDuelingDQN/train.py index 6d5a4de..21f0f58 100755 --- a/l2rpn_baselines/DoubleDuelingDQN/train.py +++ b/l2rpn_baselines/DoubleDuelingDQN/train.py @@ -22,7 +22,7 @@ DEFAULT_N_FRAMES = 4 DEFAULT_BATCH_SIZE = 32 DEFAULT_LR = 1e-5 - +DEFAULT_VERBOSE = True def cli(): parser = argparse.ArgumentParser(description="Train baseline DDQN") @@ -67,13 +67,15 @@ def train(env, num_pre_training_steps = DEFAULT_PRE_STEPS, num_frames = DEFAULT_N_FRAMES, batch_size= DEFAULT_BATCH_SIZE, - learning_rate= DEFAULT_LR): + learning_rate= DEFAULT_LR, + verbose=DEFAULT_VERBOSE): # Set config D3QNConfig.LR = learning_rate D3QNConfig.N_FRAMES = num_frames D3QNConfig.BATCH_SIZE = batch_size - + D3QNConfig.VERBOSE = verbose + # Limit gpu usage physical_devices = 
tf.config.list_physical_devices('GPU') if len(physical_devices) > 0: diff --git a/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQN.py b/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQN.py index 899e85b..71000b6 100644 --- a/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQN.py +++ b/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQN.py @@ -196,7 +196,7 @@ def train(self, env, episode += 1 episode_exp = [] - if step % 1000 == 0: + if cfg.VERBOSE and step % 1000 == 0: print("Step [{}] -- Dropout [{}]".format(step, epsilon)) # Choose an action @@ -259,8 +259,9 @@ def train(self, env, if self.done: self.epoch_rewards.append(total_reward) self.epoch_alive.append(alive_steps) - print("Survived [{}] steps".format(alive_steps)) - print("Total reward [{}]".format(total_reward)) + if cfg.VERBOSE: + print("Survived [{}] steps".format(alive_steps)) + print("Total reward [{}]".format(total_reward)) alive_steps = 0 total_reward = 0 else: @@ -349,7 +350,9 @@ def _batch_train(self, batch, step, training_step): # Log some useful metrics if step % (cfg.UPDATE_FREQ * 2) == 0: - print("loss =", loss) + if cfg.VERBOSE: + print("loss =", loss) + with self.tf_writer.as_default(): mean_reward = np.mean(self.epoch_rewards) mean_alive = np.mean(self.epoch_alive) diff --git a/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQNConfig.py b/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQNConfig.py index 108a488..47b72c8 100644 --- a/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQNConfig.py +++ b/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQNConfig.py @@ -14,6 +14,7 @@ class DoubleDuelingRDQNConfig(): TRACE_LENGTH = 8 BATCH_SIZE = 32 LR = 1e-5 + VERBOSE = True @staticmethod def from_json(json_in_path): diff --git a/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQN_NN.py b/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQN_NN.py index 367265e..884a7ec 100644 --- a/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQN_NN.py +++ b/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQN_NN.py @@ -168,4 +168,4 @@ def save_network(self, path): def load_network(self, path): # nothing has changed self.model.load_weights(path) - print("Succesfully loaded network from: {}".format(path)) + print("Successfully loaded network from: {}".format(path)) diff --git a/l2rpn_baselines/DoubleDuelingRDQN/evaluate.py b/l2rpn_baselines/DoubleDuelingRDQN/evaluate.py index 1b07372..490eb6c 100755 --- a/l2rpn_baselines/DoubleDuelingRDQN/evaluate.py +++ b/l2rpn_baselines/DoubleDuelingRDQN/evaluate.py @@ -25,6 +25,7 @@ DEFAULT_NB_EPISODE = 1 DEFAULT_NB_PROCESS = 1 DEFAULT_MAX_STEPS = -1 +DEFAULT_VERBOSE = True def cli(): parser = argparse.ArgumentParser(description="Eval baseline DDDQN") @@ -57,7 +58,7 @@ def evaluate(env, nb_episode=DEFAULT_NB_EPISODE, nb_process=DEFAULT_NB_PROCESS, max_steps=DEFAULT_MAX_STEPS, - verbose=False, + verbose=DEFAULT_VERBOSE, save_gif=False): # Limit gpu usage @@ -65,7 +66,7 @@ def evaluate(env, tf.config.experimental.set_memory_growth(physical_devices[0], True) runner_params = env.get_params_for_runner() - runner_params["verbose"] = args.verbose + runner_params["verbose"] = verbose # Run # Create agent @@ -82,10 +83,11 @@ def evaluate(env, agentInstance=agent) # Print model summary - stringlist = [] - agent.Qmain.model.summary(print_fn=lambda x: stringlist.append(x)) - short_model_summary = "\n".join(stringlist) - print(short_model_summary) + if verbose: + stringlist = [] + agent.Qmain.model.summary(print_fn=lambda x: stringlist.append(x)) + short_model_summary = "\n".join(stringlist) + 
print(short_model_summary) # Run os.makedirs(logs_path, exist_ok=True) @@ -93,19 +95,22 @@ def evaluate(env, nb_episode=nb_episode, nb_process=nb_process, max_iter=max_steps, - pbar=True) + pbar=verbose) # Print summary - print("Evaluation summary:") - for _, chron_name, cum_reward, nb_time_step, max_ts in res: - msg_tmp = "chronics at: {}".format(chron_name) - msg_tmp += "\ttotal reward: {:.6f}".format(cum_reward) - msg_tmp += "\ttime steps: {:.0f}/{:.0f}".format(nb_time_step, max_ts) - print(msg_tmp) + if verbose: + print("Evaluation summary:") + for _, chron_name, cum_reward, nb_time_step, max_ts in res: + msg_tmp = "chronics at: {}".format(chron_name) + msg_tmp += "\ttotal reward: {:.6f}".format(cum_reward) + msg_tmp += "\ttime steps: {:.0f}/{:.0f}".format(nb_time_step, + max_ts) + print(msg_tmp) if save_gif: save_log_gif(logs_path, res) + return res if __name__ == "__main__": # Parse command line diff --git a/l2rpn_baselines/DoubleDuelingRDQN/train.py b/l2rpn_baselines/DoubleDuelingRDQN/train.py index 582a466..d75f216 100755 --- a/l2rpn_baselines/DoubleDuelingRDQN/train.py +++ b/l2rpn_baselines/DoubleDuelingRDQN/train.py @@ -26,7 +26,7 @@ DEFAULT_TRACE_LEN = 12 DEFAULT_BATCH_SIZE = 32 DEFAULT_LR = 1e-5 - +DEFAULT_VERBOSE = True def cli(): parser = argparse.ArgumentParser(description="Train baseline DDQN") @@ -73,12 +73,14 @@ def train(env, num_pre_training_steps=DEFAULT_PRE_STEPS, trace_length=DEFAULT_TRACE_LEN, batch_size=DEFAULT_BATCH_SIZE, - learning_rate=DEFAULT_LR): + learning_rate=DEFAULT_LR, + verbose=DEFAULT_VERBOSE): # Set config RDQNConfig.TRACE_LENGTH = trace_length RDQNConfig.BATCH_SIZE = batch_size RDQNConfig.LR = learning_rate + RDQNConfig.VERBOSE = verbose # Limit gpu usage physical_devices = tf.config.list_physical_devices('GPU') diff --git a/l2rpn_baselines/SliceRDQN/SliceRDQN.py b/l2rpn_baselines/SliceRDQN/SliceRDQN.py index ccc2e24..df093f2 100644 --- a/l2rpn_baselines/SliceRDQN/SliceRDQN.py +++ b/l2rpn_baselines/SliceRDQN/SliceRDQN.py @@ -216,7 +216,7 @@ def shuff(x): episode += 1 episode_exp = [] - if step % cfg.SUFFLE_FREQ == 0: + if cfg.VERBOSE and step % cfg.SUFFLE_FREQ == 0: print("Step [{}] -- Dropout [{}]".format(step, self.epsilon)) # Choose an action @@ -274,8 +274,9 @@ def shuff(x): if self.done: self.epoch_rewards.append(total_reward) self.epoch_alive.append(alive_steps) - print("Survived [{}] steps".format(alive_steps)) - print("Total reward [{}]".format(total_reward)) + if cfg.VERBOSE: + print("Survived [{}] steps".format(alive_steps)) + print("Total reward [{}]".format(total_reward)) alive_steps = 0 total_reward = 0 else: @@ -367,7 +368,8 @@ def _batch_train(self, batch, training_step, step): loss = self.Qmain.model.train_on_batch(batch_x, batch_y) loss = loss[0] - print("loss =", loss) + if cfg.VERBOSE: + print("loss =", loss) with self.tf_writer.as_default(): mean_reward = np.mean(self.epoch_rewards) mean_alive = np.mean(self.epoch_alive) diff --git a/l2rpn_baselines/SliceRDQN/SliceRDQN_Config.py b/l2rpn_baselines/SliceRDQN/SliceRDQN_Config.py index 16ffffa..bd61ce5 100644 --- a/l2rpn_baselines/SliceRDQN/SliceRDQN_Config.py +++ b/l2rpn_baselines/SliceRDQN/SliceRDQN_Config.py @@ -16,6 +16,7 @@ class SliceRDQN_Config(): TRACE_LENGTH = 8 BATCH_SIZE = 32 LR = 1e-5 + VERBOSE = True @staticmethod def from_json(json_in_path): diff --git a/l2rpn_baselines/SliceRDQN/SliceRDQN_NN.py b/l2rpn_baselines/SliceRDQN/SliceRDQN_NN.py index 2d0d25e..3c9d832 100644 --- a/l2rpn_baselines/SliceRDQN/SliceRDQN_NN.py +++ b/l2rpn_baselines/SliceRDQN/SliceRDQN_NN.py @@ -243,5 
+243,5 @@ def save_network(self, path): def load_network(self, path): # nothing has changed self.model.load_weights(path) - print("Succesfully loaded network from: {}".format(path)) + print("Successfully loaded network from: {}".format(path)) diff --git a/l2rpn_baselines/SliceRDQN/evaluate.py b/l2rpn_baselines/SliceRDQN/evaluate.py index 5680e75..b977e78 100755 --- a/l2rpn_baselines/SliceRDQN/evaluate.py +++ b/l2rpn_baselines/SliceRDQN/evaluate.py @@ -24,7 +24,7 @@ DEFAULT_NB_EPISODE = 1 DEFAULT_NB_PROCESS = 1 DEFAULT_MAX_STEPS = -1 - +DEFAULT_VERBOSE = True def cli(): parser = argparse.ArgumentParser(description="Eval baseline DDDQN") @@ -57,7 +57,7 @@ def evaluate(env, nb_episode=DEFAULT_NB_EPISODE, nb_process=DEFAULT_NB_PROCESS, max_steps=DEFAULT_MAX_STEPS, - verbose=False, + verbose=DEFAULT_VERBOSE, save_gif=False): # Limit gpu usage @@ -66,7 +66,7 @@ def evaluate(env, tf.config.experimental.set_memory_growth(physical_devices[0], True) runner_params = env.get_params_for_runner() - runner_params["verbose"] = args.verbose + runner_params["verbose"] = verbose # Run # Create agent @@ -83,10 +83,11 @@ def evaluate(env, agentInstance=agent) # Print model summary - stringlist = [] - agent.Qmain.model.summary(print_fn=lambda x: stringlist.append(x)) - short_model_summary = "\n".join(stringlist) - print(short_model_summary) + if verbose: + stringlist = [] + agent.Qmain.model.summary(print_fn=lambda x: stringlist.append(x)) + short_model_summary = "\n".join(stringlist) + print(short_model_summary) # Run os.makedirs(logs_path, exist_ok=True) @@ -94,19 +95,23 @@ def evaluate(env, nb_episode=nb_episode, nb_process=nb_process, max_iter=max_steps, - pbar=True) + pbar=verbose) # Print summary - print("Evaluation summary:") - for _, chron_name, cum_reward, nb_time_step, max_ts in res: - msg_tmp = "chronics at: {}".format(chron_name) - msg_tmp += "\ttotal reward: {:.6f}".format(cum_reward) - msg_tmp += "\ttime steps: {:.0f}/{:.0f}".format(nb_time_step, max_ts) - print(msg_tmp) + if verbose: + print("Evaluation summary:") + for _, chron_name, cum_reward, nb_time_step, max_ts in res: + msg_tmp = "chronics at: {}".format(chron_name) + msg_tmp += "\ttotal reward: {:.6f}".format(cum_reward) + msg_tmp += "\ttime steps: {:.0f}/{:.0f}".format(nb_time_step, + max_ts) + print(msg_tmp) if save_gif: save_log_gif(logs_path, res) + return res + if __name__ == "__main__": # Parse command line diff --git a/l2rpn_baselines/SliceRDQN/train.py b/l2rpn_baselines/SliceRDQN/train.py index 75f7c14..a5a8c0d 100755 --- a/l2rpn_baselines/SliceRDQN/train.py +++ b/l2rpn_baselines/SliceRDQN/train.py @@ -27,7 +27,7 @@ DEFAULT_TRACE_LEN = 12 DEFAULT_BATCH_SIZE = 32 DEFAULT_LR = 1e-5 - +DEFAULT_VERBOSE = True def cli(): parser = argparse.ArgumentParser(description="Train baseline DDQN") @@ -75,12 +75,14 @@ def train(env, num_pre_training_steps=DEFAULT_PRE_STEPS, trace_length=DEFAULT_TRACE_LEN, batch_size=DEFAULT_BATCH_SIZE, - learning_rate=DEFAULT_LR): + learning_rate=DEFAULT_LR, + verbose=DEFAULT_VERBOSE): # Set config RDQNConfig.LR = learning_rate RDQNConfig.BATCH_SIZE = batch_size RDQNConfig.TRACE_LENGTH = trace_length + RDQNConfig.VERBOSE = verbose # Limit gpu usage physical_devices = tf.config.list_physical_devices('GPU') diff --git a/l2rpn_baselines/test/test_train_eval.py b/l2rpn_baselines/test/test_train_eval.py index 4c549bb..eec3f6c 100644 --- a/l2rpn_baselines/test/test_train_eval.py +++ b/l2rpn_baselines/test/test_train_eval.py @@ -16,12 +16,21 @@ from l2rpn_baselines.utils import TrainingParam, NNParam from 
l2rpn_baselines.DeepQSimple import train as train_dqn from l2rpn_baselines.DeepQSimple import evaluate as eval_dqn -from l2rpn_baselines.DuelQSimple import train as train_d3qn -from l2rpn_baselines.DuelQSimple import evaluate as eval_d3qn +from l2rpn_baselines.DuelQSimple import train as train_d3qs +from l2rpn_baselines.DuelQSimple import evaluate as eval_d3qs from l2rpn_baselines.SAC import train as train_sac from l2rpn_baselines.SAC import evaluate as eval_sac from l2rpn_baselines.DuelQLeapNet import train as train_leap from l2rpn_baselines.DuelQLeapNet import evaluate as eval_leap +from l2rpn_baselines.DoubleDuelingDQN import train as train_d3qn +from l2rpn_baselines.DoubleDuelingDQN import evaluate as eval_d3qn +from l2rpn_baselines.DoubleDuelingDQN import DoubleDuelingDQNConfig as d3qn_cfg +from l2rpn_baselines.DoubleDuelingRDQN import train as train_rqn +from l2rpn_baselines.DoubleDuelingRDQN import evaluate as eval_rqn +from l2rpn_baselines.DoubleDuelingRDQN import DoubleDuelingRDQNConfig as rdqn_cfg +from l2rpn_baselines.SliceRDQN import train as train_srqn +from l2rpn_baselines.SliceRDQN import evaluate as eval_srqn +from l2rpn_baselines.SliceRDQN import SliceRDQN_Config as srdqn_cfg os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' @@ -105,7 +114,7 @@ def test_train_eval(self): "set_topo_vect": False } nm_ = "AnneOnymous" - train_d3qn(env, + train_d3qs(env, name=nm_, iterations=100, save_path=tmp_dir, @@ -117,7 +126,7 @@ def test_train_eval(self): kwargs_converters=kwargs_converters, kwargs_archi=kwargs_archi) - baseline_2 = eval_d3qn(env, + baseline_2 = eval_d3qs(env, name=nm_, load_path=tmp_dir, logs_path=tmp_dir, @@ -248,6 +257,113 @@ def test_train_eval(self): verbose=False, save_gif=False) +class TestD3QN(unittest.TestCase): + def test_train_eval(self): + tmp_dir = tempfile.mkdtemp() + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + env = grid2op.make("rte_case5_example", test=True) + nm_ = "test_D3QN" + + d3qn_cfg.INITIAL_EPSILON = 1.0 + d3qn_cfg.FINAL_EPSILON = 0.01 + d3qn_cfg.DECAY_EPSILON = 20 + d3qn_cfg.UPDATE_FREQ = 16 + + train_d3qn(env, + name=nm_, + iterations=100, + save_path=tmp_dir, + load_path=None, + logs_path=tmp_dir, + learning_rate=1e-4, + verbose=False, + num_pre_training_steps=32, + num_frames=4, + batch_size=8) + + model_path = os.path.join(tmp_dir, nm_ + ".h5") + eval_res = eval_d3qn(env, + load_path=model_path, + logs_path=tmp_dir, + nb_episode=1, + nb_process=1, + max_steps=10, + verbose=False, + save_gif=False) + + assert eval_res is not None + +class TestRDQN(unittest.TestCase): + def test_train_eval(self): + tmp_dir = tempfile.mkdtemp() + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + env = grid2op.make("rte_case5_example", test=True) + nm_ = "test_RDQN" + rdqn_cfg.INITIAL_EPSILON = 1.0 + rdqn_cfg.FINAL_EPSILON = 0.01 + rdqn_cfg.DECAY_EPSILON = 20 + rdqn_cfg.UPDATE_FREQ = 16 + + train_rqn(env, + name=nm_, + iterations=100, + save_path=tmp_dir, + load_path=None, + logs_path=tmp_dir, + learning_rate=1e-4, + verbose=False, + num_pre_training_steps=16, + batch_size=8) + + model_path = os.path.join(tmp_dir, nm_ + ".tf") + eval_res = eval_rqn(env, + load_path=model_path, + logs_path=tmp_dir, + nb_episode=1, + nb_process=1, + max_steps=10, + verbose=False, + save_gif=False) + + assert eval_res is not None + +class TestSRDQN(unittest.TestCase): + def test_train_eval(self): + tmp_dir = tempfile.mkdtemp() + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + env = grid2op.make("rte_case5_example", 
test=True) + nm_ = "test_SRDQN" + srdqn_cfg.INITIAL_EPSILON = 1.0 + srdqn_cfg.FINAL_EPSILON = 0.01 + srdqn_cfg.DECAY_EPSILON = 20 + srdqn_cfg.UPDATE_FREQ = 16 + + train_srqn(env, + name=nm_, + iterations=100, + save_path=tmp_dir, + load_path=None, + logs_path=tmp_dir, + learning_rate=1e-4, + verbose=False, + num_pre_training_steps=32, + batch_size=8) + + model_path = os.path.join(tmp_dir, nm_ + ".tf") + eval_res = eval_srqn(env, + load_path=model_path, + logs_path=tmp_dir, + nb_episode=1, + nb_process=1, + max_steps=10, + verbose=False, + save_gif=False) + + assert eval_res is not None + if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main() From 1eba79847d6f788170a95a6cf1b8e84b578cb5a8 Mon Sep 17 00:00:00 2001 From: Tezirg Date: Thu, 11 Jun 2020 12:02:16 +0200 Subject: [PATCH 17/24] Unlink dev submodule WolperGrid --- .gitmodules | 3 --- 1 file changed, 3 deletions(-) diff --git a/.gitmodules b/.gitmodules index 2cf73ec..4e37359 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,9 +4,6 @@ [submodule "l2rpn_baselines/Geirina"] path = l2rpn_baselines/Geirina url = https://github.com/djmax008/GEIRINA_baseline -[submodule "l2rpn_baselines/WolperGrid"] - path = l2rpn_baselines/WolperGrid - url = git@github.com:Tezirg/WolperGrid.git [submodule "l2rpn_baselines/AsynchronousActorCritic"] path = l2rpn_baselines/AsynchronousActorCritic url = https://github.com/KishanGitASU/A3C-RL-baseline-agent-for-Grid2Op-environment.git From 816124fdd4b897c434d1d125111127897f75ab2d Mon Sep 17 00:00:00 2001 From: Tezirg Date: Thu, 11 Jun 2020 12:04:31 +0200 Subject: [PATCH 18/24] Unlink dev submodule --- l2rpn_baselines/WolperGrid | 1 - 1 file changed, 1 deletion(-) delete mode 160000 l2rpn_baselines/WolperGrid diff --git a/l2rpn_baselines/WolperGrid b/l2rpn_baselines/WolperGrid deleted file mode 160000 index 2d782ee..0000000 --- a/l2rpn_baselines/WolperGrid +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 2d782eef2417c07db35fa13552fd4d5bc0418d37 From 81c4ffc503fdf6b9ca7c66059c6b1a3e9edc75df Mon Sep 17 00:00:00 2001 From: Tezirg Date: Fri, 12 Jun 2020 16:49:14 +0200 Subject: [PATCH 19/24] Adds WolperGrid Agent for dev --- .gitmodules | 3 +++ l2rpn_baselines/WolperGrid | 1 + 2 files changed, 4 insertions(+) create mode 160000 l2rpn_baselines/WolperGrid diff --git a/.gitmodules b/.gitmodules index 4e37359..baf80b8 100644 --- a/.gitmodules +++ b/.gitmodules @@ -7,3 +7,6 @@ [submodule "l2rpn_baselines/AsynchronousActorCritic"] path = l2rpn_baselines/AsynchronousActorCritic url = https://github.com/KishanGitASU/A3C-RL-baseline-agent-for-Grid2Op-environment.git +[submodule "l2rpn_baselines/WolperGrid"] + path = l2rpn_baselines/WolperGrid + url = git@github.com:Tezirg/WolperGrid.git diff --git a/l2rpn_baselines/WolperGrid b/l2rpn_baselines/WolperGrid new file mode 160000 index 0000000..be6b025 --- /dev/null +++ b/l2rpn_baselines/WolperGrid @@ -0,0 +1 @@ +Subproject commit be6b02566a956f302b170de05d1845c1d96789c6 From c4eeefbd0d5981988b76f31418f9d03e29591ff9 Mon Sep 17 00:00:00 2001 From: Tezirg Date: Fri, 12 Jun 2020 17:59:49 +0200 Subject: [PATCH 20/24] D3QN Config class docstring --- l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQNConfig.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQNConfig.py b/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQNConfig.py index 2f3c7a0..a343692 100644 --- a/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQNConfig.py +++ b/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQNConfig.py @@ -2,6 
import json class DoubleDuelingDQNConfig(): + """ + DoubleDuelingDQN configurable hyperparameters + exposed as class attributes + """ + LR_DECAY_STEPS = 1024*64 LR_DECAY_RATE = 0.95 INITIAL_EPSILON = 0.99 From c8d8813c1632839b7c854b72999f5ce426ec7f29 Mon Sep 17 00:00:00 2001 From: Tezirg Date: Fri, 12 Jun 2020 18:02:01 +0200 Subject: [PATCH 21/24] D3QN Sphinx documentation --- docs/DoubleDuelingDQN.rst | 56 +++++++++++++++++++ docs/index.rst | 2 + .../DoubleDuelingDQNConfig.py | 45 +++++++++++++++ 3 files changed, 103 insertions(+) create mode 100644 docs/DoubleDuelingDQN.rst create mode 100644 l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQNConfig.py diff --git a/docs/DoubleDuelingDQN.rst b/docs/DoubleDuelingDQN.rst new file mode 100644 index 0000000..3944bfa --- /dev/null +++ b/docs/DoubleDuelingDQN.rst @@ -0,0 +1,56 @@ +DoubleDuelingDQN: An example implementation of Double Duelling Deep Q Network +================================================================================ + +Description +----------- +This module serves as a concrete example of how to implement a D3QN baseline. +This baseline is of type Double Duelling Deep Q Network: it combines a duelling Q-network architecture with a double Q-learning update. + +Its main purpose is to provide an example of this network type running with Grid2Op. However, don't expect to obtain state-of-the-art results. + + +Agent class +------------------------ +You can use this class with: + +.. code-block:: python + + from l2rpn_baselines.DoubleDuelingDQN import DoubleDuelingDQN + from l2rpn_baselines.DoubleDuelingDQN import train + from l2rpn_baselines.DoubleDuelingDQN import evaluate + +.. automodule:: l2rpn_baselines.DoubleDuelingDQN + :members: + :autosummary: + +Configuration +------------------------ +Training a model requires tweaking many hyperparameters; these are exposed as attributes of a dedicated configuration class: + +.. code-block:: python + + from l2rpn_baselines.DoubleDuelingDQN import DoubleDuelingDQNConfig + + # Set hyperparameters before training + DoubleDuelingDQNConfig.LR = 1e-5 + DoubleDuelingDQNConfig.INITIAL_EPSILON = 1.0 + DoubleDuelingDQNConfig.FINAL_EPSILON = 0.001 + DoubleDuelingDQNConfig.DECAY_EPSILON = 10000 + +.. automodule:: l2rpn_baselines.DoubleDuelingDQN.DoubleDuelingDQNConfig + :members: + :undoc-members: + +Internal classes +------------------------ +The neural network model is defined in a separate class. +You may want to import it manually: + +.. code-block:: python + + from l2rpn_baselines.DoubleDuelingDQN.DoubleDuelingDQN_NN import DoubleDuelingDQN_NN + + +.. 
autoclass:: l2rpn_baselines.DoubleDuelingDQN.DoubleDuelingDQN_NN.DoubleDuelingDQN_NN + :members: + :autosummary: diff --git a/docs/index.rst b/docs/index.rst index ed44a1e..81b4f1b 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -28,6 +28,8 @@ Baseline already Available DeepQSimple DuelQSimple SAC + DoubleDuelingDQN + More advanced baselines ------------------------ diff --git a/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQNConfig.py b/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQNConfig.py new file mode 100644 index 0000000..a343692 --- /dev/null +++ b/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQNConfig.py @@ -0,0 +1,45 @@ +import os +import json + +class DoubleDuelingDQNConfig(): + """ + DoubleDuelingDQN configurable hyperparameters + exposed as class attributes + """ + + LR_DECAY_STEPS = 1024*64 + LR_DECAY_RATE = 0.95 + INITIAL_EPSILON = 0.99 + FINAL_EPSILON = 0.001 + DECAY_EPSILON = 1024*64 + DISCOUNT_FACTOR = 0.98 + PER_CAPACITY = 1024*64 + PER_ALPHA = 0.7 + PER_BETA = 0.5 + UPDATE_FREQ = 28 + UPDATE_TARGET_HARD_FREQ = -1 + UPDATE_TARGET_SOFT_TAU = 1e-3 + N_FRAMES = 4 + BATCH_SIZE = 32 + LR = 1e-5 + VERBOSE = True + + @staticmethod + def from_json(json_in_path): + with open(json_in_path, 'r') as fp: + conf_json = json.load(fp) + + for k,v in conf_json.items(): + if hasattr(DoubleDuelingDQNConfig, k): + setattr(DoubleDuelingDQNConfig, k, v) + + @staticmethod + def to_json(json_out_path): + conf_json = {} + for attr in dir(DoubleDuelingDQNConfig): + if attr.startswith('__') or callable(getattr(DoubleDuelingDQNConfig, attr)): + continue + conf_json[attr] = getattr(DoubleDuelingDQNConfig, attr) + + with open(json_out_path, 'w+') as fp: + json.dump(conf_json, fp, indent=2) From 8e1c71700bb1e16acafff9a9b69ca0a44448f101 Mon Sep 17 00:00:00 2001 From: Tezirg Date: Fri, 12 Jun 2020 18:18:14 +0200 Subject: [PATCH 22/24] RDQN: Sphinx docstring --- docs/DoubleDuelingDQN.rst | 2 +- docs/DoubleDuelingRDQN.rst | 54 +++++++++++++++++++ docs/index.rst | 1 + .../DoubleDuelingRDQNConfig.py | 4 ++ 4 files changed, 60 insertions(+), 1 deletion(-) create mode 100644 docs/DoubleDuelingRDQN.rst diff --git a/docs/DoubleDuelingDQN.rst b/docs/DoubleDuelingDQN.rst index 3944bfa..df13a60 100644 --- a/docs/DoubleDuelingDQN.rst +++ b/docs/DoubleDuelingDQN.rst @@ -19,7 +19,7 @@ You can use this class with: from l2rpn_baselines.DoubleDuelingDQN import train from l2rpn_baselines.DoubleDuelingDQN import evaluate -.. automodule:: l2rpn_baselines.DoubleDuelingDQN +.. automodule:: l2rpn_baselines.DoubleDuelingDQN.DoubleDuelingDQN :members: :autosummary: diff --git a/docs/DoubleDuelingRDQN.rst b/docs/DoubleDuelingRDQN.rst new file mode 100644 index 0000000..c286e1f --- /dev/null +++ b/docs/DoubleDuelingRDQN.rst @@ -0,0 +1,54 @@ +DoubleDuelingRDQN: An example implementation of a Recurrent DoubleQ Network +============================================================================ + +Description +----------- +This module serves as a concrete example of how to implement a recurrent D3QN baseline. +This baseline is of type Recurrent Double Duelling Deep Q Network: it combines a duelling Q-network architecture, a double Q-learning update and a recurrent neural network. + +Its main purpose is to provide an example of this network type running with Grid2Op. However, don't expect to obtain state-of-the-art results. + + +Agent class +------------------------ +You can use this class with: + +.. code-block:: python + + from l2rpn_baselines.DoubleDuelingRDQN import DoubleDuelingRDQN + from l2rpn_baselines.DoubleDuelingRDQN import train + from l2rpn_baselines.DoubleDuelingRDQN import evaluate + +.. 
automodule:: l2rpn_baselines.DoubleDuelingRDQN.DoubleDuelingRDQN + :members: + :autosummary: + +Configuration +------------------------ +Training a model requires tweaking many hyperparameters; these are exposed as attributes of a dedicated configuration class: + +.. code-block:: python + + from l2rpn_baselines.DoubleDuelingRDQN import DoubleDuelingRDQNConfig + + # Set hyperparameters before training + DoubleDuelingRDQNConfig.LR = 1e-5 + DoubleDuelingRDQNConfig.TRACE_LENGTH = 12 + +.. automodule:: l2rpn_baselines.DoubleDuelingRDQN.DoubleDuelingRDQNConfig + :members: + :undoc-members: + +Internal classes +------------------------ +The neural network model is defined in a separate class. +You may want to import it manually: + +.. code-block:: python + + from l2rpn_baselines.DoubleDuelingRDQN.DoubleDuelingRDQN_NN import DoubleDuelingRDQN_NN + + +.. autoclass:: l2rpn_baselines.DoubleDuelingRDQN.DoubleDuelingRDQN_NN.DoubleDuelingRDQN_NN + :members: + :autosummary: diff --git a/docs/index.rst b/docs/index.rst index 81b4f1b..3faf82f 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -29,6 +29,7 @@ Baseline already Available DuelQSimple SAC DoubleDuelingDQN + DoubleDuelingRDQN More advanced baselines diff --git a/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQNConfig.py b/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQNConfig.py index 47b72c8..321fc29 100644 --- a/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQNConfig.py +++ b/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQNConfig.py @@ -2,6 +2,10 @@ import json class DoubleDuelingRDQNConfig(): + """ + DoubleDuelingRDQN configurable hyperparameters as class attributes + """ + INITIAL_EPSILON = 0.99 FINAL_EPSILON = 0.01 DECAY_EPSILON = 1024*32 From b16b9bdb263ca7d8230d6faf5721d7c7efa65b6b Mon Sep 17 00:00:00 2001 From: Tezirg Date: Mon, 15 Jun 2020 15:46:31 +0200 Subject: [PATCH 23/24] Removes dev submodule --- .gitmodules | 5 +---- l2rpn_baselines/WolperGrid | 1 - 2 files changed, 1 insertion(+), 5 deletions(-) delete mode 160000 l2rpn_baselines/WolperGrid diff --git a/.gitmodules b/.gitmodules index baf80b8..6ade8ce 100644 --- a/.gitmodules +++ b/.gitmodules @@ -6,7 +6,4 @@ url = https://github.com/djmax008/GEIRINA_baseline [submodule "l2rpn_baselines/AsynchronousActorCritic"] path = l2rpn_baselines/AsynchronousActorCritic - url = https://github.com/KishanGitASU/A3C-RL-baseline-agent-for-Grid2Op-environment.git -[submodule "l2rpn_baselines/WolperGrid"] - path = l2rpn_baselines/WolperGrid - url = git@github.com:Tezirg/WolperGrid.git + url = https://github.com/KishanGitASU/A3C-RL-baseline-agent-for-Grid2Op-environment.git \ No newline at end of file diff --git a/l2rpn_baselines/WolperGrid b/l2rpn_baselines/WolperGrid deleted file mode 160000 index be6b025..0000000 --- a/l2rpn_baselines/WolperGrid +++ /dev/null @@ -1 +0,0 @@ -Subproject commit be6b02566a956f302b170de05d1845c1d96789c6 From cc309df18dc624961405217f4090496ee64ffb21 Mon Sep 17 00:00:00 2001 From: Tezirg Date: Tue, 16 Jun 2020 09:31:16 +0200 Subject: [PATCH 24/24] Doc: Ordering index --- docs/index.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index 3faf82f..6492796 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -26,10 +26,9 @@ Baseline already Available utils DeepQSimple + DoubleDuelingDQN DuelQSimple SAC - DoubleDuelingDQN - DoubleDuelingRDQN More advanced baselines @@ -39,6 +38,7 @@ More advanced baselines :maxdepth: 2 DuelQLeapNet + DoubleDuelingRDQN Contributions
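
Taken together, these patches expose the D3QN/RDQN/SliceRDQN hyperparameters as class attributes on dedicated config classes, add a verbose switch to train/evaluate, and make evaluate return the Grid2Op runner results. The sketch below is illustrative only and mirrors the usage exercised in test_train_eval.py; the model name "demo" and the ./models and ./logs directories are placeholders, not values taken from these patches:

.. code-block:: python

    import grid2op
    from l2rpn_baselines.DoubleDuelingDQN import DoubleDuelingDQNConfig as cfg
    from l2rpn_baselines.DoubleDuelingDQN import train, evaluate

    env = grid2op.make("rte_case5_example", test=True)

    # Hyperparameters are plain class attributes; set them before calling train()
    cfg.VERBOSE = False
    cfg.INITIAL_EPSILON = 1.0
    cfg.DECAY_EPSILON = 1024

    # Train a small model ("demo" and both paths are placeholders)
    train(env, name="demo", iterations=100,
          save_path="./models", load_path=None, logs_path="./logs",
          verbose=False)

    # evaluate() now returns the runner results instead of only printing them
    res = evaluate(env, load_path="./models/demo.h5", logs_path="./logs",
                   nb_episode=1, nb_process=1, max_steps=10,
                   verbose=False, save_gif=False)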