# dqn.py
import os
import matplotlib.pyplot as plt
import pandas as pd
import gym
import random
import collections
import numpy as np
import tensorflow as tf
################################################################
# suppress tensorflow logs.
# Solution from https://stackoverflow.com/a/57497465
import logging, os
logging.disable(logging.WARNING)
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
################################################################
class ReplayBuffer:  # Experience replay buffer
    def __init__(self, bufferlimit):
        self.bufferlimit = bufferlimit
        self.buffer = collections.deque(maxlen=self.bufferlimit)

    def put(self, transition):
        '''
        Inserts an experience into the experience replay buffer.
        Because the deque was created with maxlen, the oldest transition is
        discarded automatically once the buffer is full.
        '''
        self.buffer.append(transition)

    def sample(self, mini_batch_size=32):
        '''
        Randomly samples up to mini_batch_size experiences from the experience replay buffer.
        '''
        mini_batch = random.sample(self.buffer, min(len(self.buffer), mini_batch_size))
        return mini_batch

    def size(self):
        '''
        Returns the current number of experiences in the replay buffer.
        '''
        return len(self.buffer)
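
# Example usage of ReplayBuffer (a sketch that mirrors the training loop below;
# s, a, r, s_prime and done are the variable names used in that loop):
#   memory = ReplayBuffer(bufferlimit=50_000)
#   memory.put((s, a, r, s_prime, int(done)))        # store one transition
#   mini_batch = memory.sample(mini_batch_size=32)   # list of sampled transitions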
######################################################################
class Qnet:  # Q-network class
    def __init__(self, hidden_layers=2, observation_space=4, action_space=2, learning_rate=0.0001, units=64):
        '''
        Creates the neural network model with the given hyperparameters.
        '''
        self.input_shape = observation_space
        self.output_shape = action_space
        self.learning_rate = learning_rate
        self.units = units
        self.model = tf.keras.models.Sequential()
        self.model.add(tf.keras.layers.Dense(self.units, input_shape=(self.input_shape,), activation="relu"))  # 1st hidden layer
        for i in range(hidden_layers - 1):  # create the rest of the hidden layers
            self.model.add(tf.keras.layers.Dense(self.units, activation="relu"))
        self.model.add(tf.keras.layers.Dense(self.output_shape, activation="linear"))  # output layer
        self.model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=self.learning_rate), loss="mse")
    def train(self, q_target, replay, mini_batch_size=32):
        '''
        Trains the main Q-network on a mini-batch sampled from the replay buffer.
        DISCOUNT_RATE is the module-level hyperparameter defined in the main block.
        '''
        mini_batch = replay.sample(mini_batch_size)
        x = []
        y = []
        for s, a, r, s_prime, done in mini_batch:
            max_future_q = np.amax(q_target.model.predict(tf.constant(s_prime, shape=(1, self.input_shape))))
            target = r + DISCOUNT_RATE * max_future_q * (1 - done)  # Bellman target from Q-target; no bootstrapping on terminal states
            current_q = self.model.predict(tf.constant(s, shape=(1, self.input_shape)))  # current q-values for all actions
            current_q[0][a] = target  # update the q-value of the chosen action to the target q-value
            x.append(s)
            y.append(current_q)
        x = tf.constant(x, shape=(len(x), self.input_shape))
        y = tf.constant(y, shape=(len(y), self.output_shape))
        self.model.fit(x, y)
    def update_weight(self, q, ep_num, update_interval=20, tau=0.01, soft=False):
        '''
        Updates the weights of the target network Q-target.
        Hard update copies the main network's weights every update_interval episodes;
        soft update applies Polyak averaging: theta_target <- tau * theta_q + (1 - tau) * theta_target.
        '''
        if not soft:
            # perform hard update on the target network
            if ep_num % update_interval == 0:
                target_theta = q.model.get_weights()
                self.model.set_weights(target_theta)
                print("Target Update")
        else:
            # perform soft update on the target network
            q_theta = q.model.get_weights()
            target_theta = self.model.get_weights()
            counter = 0
            for q_weight, target_weight in zip(q_theta, target_theta):
                target_weight = target_weight * (1 - tau) + q_weight * tau
                target_theta[counter] = target_weight
                counter += 1
            self.model.set_weights(target_theta)
    def sample_action(self, obs, epsilon):
        '''
        Selects an action for the given observation using an epsilon-greedy policy.
        '''
        coin = random.random()
        if coin <= epsilon:
            return random.randint(0, self.output_shape - 1)  # random action (exploration)
        else:
            return np.argmax(self.model.predict(obs))  # greedy action (exploitation)
######################################################################
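
# Example usage of Qnet together with a target network (a sketch that mirrors the
# main block below; s, epsilon, memory and n_epi are the names used there):
#   q = Qnet(hidden_layers=2, observation_space=4, action_space=2, learning_rate=0.0005, units=64)
#   q_target = Qnet(hidden_layers=2, observation_space=4, action_space=2, learning_rate=0.0005, units=64)
#   q_target.model.set_weights(q.model.get_weights())      # start from identical weights
#   a = q.sample_action(tf.constant(s, shape=(1, 4)), epsilon)
#   q.train(q_target, memory, mini_batch_size=256)
#   q_target.update_weight(q, ep_num=n_epi, update_interval=100, tau=0.0001, soft=True)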
def test(n_epi, epsilon, q, seed_val, vid_dir):
    '''
    Tests the performance of the DQN agent.
    The agent is run greedily in a new test environment for 10 episodes and the
    mean score over those episodes measures its performance.
    (vid_dir is accepted for compatibility with the caller but is not used here;
    input_shape is the module-level variable set in the main block.)
    '''
    rwrd = []  # list to store the episode rewards
    test_env = gym.make("CartPole-v1")  # new test environment
    test_env.seed(seed_val)  # setting the seed for test_env
    for eps in range(10):
        test_s = test_env.reset()  # initial state
        test_done = False
        test_score = 0
        while not test_done:
            test_a = np.argmax(q.model.predict(tf.constant(test_s, shape=(1, input_shape))))  # select the greedy action
            test_s_prime, test_r, test_done, test_info = test_env.step(test_a)  # get next state, reward, done flag
            test_s = test_s_prime
            test_score += test_r  # update the score for the current episode
            if test_done:
                rwrd.append(test_score)
                break
    mean_score = np.mean(rwrd)  # mean score over the 10 test episodes
    std_score = round(np.std(rwrd), 3)  # standard deviation of the scores
    print("Episode: {}. Score: {}. Std: {}. Epsilon: {}".format(n_epi, mean_score, std_score, round(epsilon, 3)))
    return mean_score, std_score
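
# To evaluate a saved checkpoint later, the weights written by save_weights() in the
# main block can be restored with Keras' load_weights() (a sketch; the exact path
# depends on the exp_name and episode number used when the weights were saved):
#   q_eval = Qnet(hidden_layers=2, observation_space=4, action_space=2)
#   q_eval.model.load_weights("Model/final_experiment/DQN_final_experiment_100/DQN_final_experiment_100")
#   test(100, 0.0, q_eval, seed_val=3, vid_dir="Video/final_experiment/")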
if __name__ == "__main__":
    ######## hyperparameters ###################
    BUFFERLIMIT = 50_000
    MINI_BATCH_SIZE = 256
    HIDDEN_LAYERS = 2
    HIDDEN_LAYER_UNITS = 64
    LEARNING_RATE = 0.0005
    DISCOUNT_RATE = 0.99
    EPISODES = 2000
    UPDATE_TARGET_INTERVAL = 100  # target update interval for hard update
    TAU = 0.0001  # used when soft update is enabled
    ############################################
    seed_val = 3
    soft_update = True  # set to True to use soft updates instead of hard updates
    log_dir = "final_experiment"  # directory name to store log data
    exp_name = "final_experiment"  # name of the experiment
    plot_title = "DQN-CartPole. Final Experiment"  # title of the plot
    #############################################
    # initialize random seeds
    np.random.seed(seed_val)
    tf.random.set_seed(seed_val)
    random.seed(seed_val)
    #############################################
    env = gym.make("CartPole-v1")  # select environment. Currently only tested on CartPole-v1
    env.seed(seed_val)  # setting the seed for the env
    #############################################
    # create the main Q-network, the target network Q-target and the experience replay buffer
    input_shape = env.observation_space.shape[0]  # input shape for the neural net
    output_shape = env.action_space.n  # output shape for the neural net
    q = Qnet(
        hidden_layers=HIDDEN_LAYERS,
        observation_space=input_shape,
        action_space=output_shape,
        learning_rate=LEARNING_RATE,
        units=HIDDEN_LAYER_UNITS
    )  # the main Q-network
    q_target = Qnet(
        hidden_layers=HIDDEN_LAYERS,
        observation_space=input_shape,
        action_space=output_shape,
        learning_rate=LEARNING_RATE,
        units=HIDDEN_LAYER_UNITS
    )  # the target network Q-target
    q_target.model.set_weights(q.model.get_weights())  # initialize Q-target with the weights of the main Q-network
    memory = ReplayBuffer(bufferlimit=BUFFERLIMIT)  # experience replay buffer
    #############################################
    ep_vec = []  # episode numbers, for plotting
    mean_score_vec = []  # mean test score per evaluation, for plotting
    std_vec = []  # standard deviation of the test rewards
    #############################################
    savename = exp_name
    performance_dir = "Performance/" + log_dir + "/"
    if not os.path.exists(performance_dir):
        os.makedirs(performance_dir)  # makedirs also creates the parent "Performance" directory if needed
    save_performance = performance_dir + savename + ".csv"
    save_plot = performance_dir + savename + ".png"
    model_dir = "Model/" + log_dir + "/"
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    save_model = model_dir + "DQN_" + savename
    vid_dir = "Video/" + log_dir + "/"
    if not os.path.exists(vid_dir):
        os.makedirs(vid_dir)
    #############################################
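    # Epsilon decays linearly from 0.99 at episode 0 to 0.01 at episode 200
    # (0.99 - 0.98 / 200 * 200 = 0.01) and is then held at 0.01 for the rest of training.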
    for n_epi in range(EPISODES + 1):
        epsilon = max(0.01, 0.99 - 0.98 / 200 * n_epi)
        s = env.reset()
        done = False
        score = 0.
        #############################################
        while not done:
            a = q.sample_action(tf.constant(s, shape=(1, input_shape)), epsilon)  # select action with the current Q-network
            s_prime, r, done, info = env.step(a)
            memory.put((s, a, r, s_prime, int(done)))  # insert the transition into the experience replay buffer
            s = s_prime
            score += r
            if done:
                break
        #############################################
        if memory.size() >= 1000:
            q.train(q_target, memory, MINI_BATCH_SIZE)  # update the main Q-network
            q_target.update_weight(q, ep_num=n_epi, update_interval=UPDATE_TARGET_INTERVAL, tau=TAU, soft=soft_update)
        if n_epi % 10 == 0:
            ep_vec.append(n_epi)
            mean_, std_ = test(n_epi, epsilon, q, seed_val, vid_dir)
            mean_score_vec.append(mean_)
            std_vec.append(std_)
            # save the main Q-network weights every 10 episodes
            ep_model = save_model + "_" + str(n_epi)
            if not os.path.exists(ep_model):
                os.makedirs(ep_model)
            q.model.save_weights(ep_model + "/" + "DQN_" + savename + "_" + str(n_epi))
    #############################################
    performance_data = {'Episode': ep_vec, 'mean_score': mean_score_vec, 'std_dev': std_vec}
    df = pd.DataFrame(performance_data)
    df.to_csv(save_performance, index=False)
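
    # A minimal plotting sketch (an addition, not shown in the original flow) that
    # uses the plot_title and save_plot values defined above to visualize training.
    plt.figure()
    plt.plot(df['Episode'], df['mean_score'], label='mean test score')
    plt.fill_between(df['Episode'],
                     df['mean_score'] - df['std_dev'],
                     df['mean_score'] + df['std_dev'],
                     alpha=0.2, label='+/- 1 std dev')
    plt.xlabel('Episode')
    plt.ylabel('Mean score over 10 test episodes')
    plt.title(plot_title)
    plt.legend()
    plt.savefig(save_plot)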