# DQN.py (forked from akashe/DeepReinforcementLearning)
import collections
import torch
from RLUtils import MeanSquarredError
from RLUtils import SGD
from RLUtils import appendOnes
import gym
import random
import time
import math
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd.functional import vjp
import matplotlib.pyplot as plt
from RLUtils import create_network_with_nn
from copy import deepcopy
random.seed(7)
torch.manual_seed(1334)
'''
Idea:
Build a simple DQN algorithm using only linear transformations and update the parameters with SGD,
using the LunarLander gym environment.
The goal is to get a feel for the algorithm, not to extend functionality, so the implementation is bare minimum.
For v2, the implementation uses vjp instead of the traditional torch backward pass.
For v3, it has proper networks, gradient clipping, and proper documentation.

Versions and analysis:
v1: simple linear transform.
Result: the model underfits, though with 5000 games and a network switch every 50 episodes the lander did pretty well.
v2: add non-linearity and the corresponding gradients.
Mistake in v2: I used only one non-linearity. The average reward stayed negative and the loss did not decrease.
v3: gradient clipping helps. The algorithm works better and faster with larger model capacity,
e.g. [256, 128, 64] for LunarLander rather than [50, 10]. A very small replay buffer does not work well.
Learning and switching the networks together speeds up getting good rewards.
'''
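# The network factory used below comes from RLUtils, which is not included in this file.
# As a rough, hypothetical sketch (an assumption about RLUtils, not its actual code),
# create_network_with_nn([in_dim, *hidden_dims, out_dim]) can be implemented as an MLP of
# Linear layers with ReLU activations in between. The reference function below is only
# illustrative and is not used by the agents in this file.
def _reference_mlp(layer_sizes):
    import torch.nn as nn  # local import so the sketch stays self-contained
    layers = []
    for i in range(len(layer_sizes) - 1):
        layers.append(nn.Linear(layer_sizes[i], layer_sizes[i + 1]))
        if i < len(layer_sizes) - 2:  # no activation after the output (Q-value) layer
            layers.append(nn.ReLU())
    return nn.Sequential(*layers)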
class DQNAgentv3:
def __init__(self, discounted_factor, epsilon_start, epsilon_decay, epsilon_end, observation_size, no_of_actions,
batch_size, learning_rate, max_memory_size, mid_layers=[256,128,64], gradient_clip=1.,polyak=0.995):
'''
:param discounted_factor: discount factor for future rewards
:param epsilon_start: starting value of epsilon in e-greedy exploration
:param epsilon_decay: rate of epsilon decay
:param epsilon_end: ending value of epsilon
:param observation_size: state size or observation size or input size
:param no_of_actions: total number of discrete actions
:param batch_size: batch size for experience replay
        :param learning_rate: learning rate for the Adam optimizer
        :param max_memory_size: total size of memory buffer
        :param mid_layers: sizes of the hidden layers
        :param gradient_clip: max norm used for gradient clipping
        :param polyak: polyak averaging coefficient for the target network update
'''
self.d = discounted_factor
self.e = epsilon_start
self.e_decay = epsilon_decay
self.e_end = epsilon_end
self.o_size = observation_size
self.n_actions = no_of_actions
self.batch_size = batch_size
self.memory = collections.deque(maxlen=max_memory_size)
self.lr = learning_rate
# Initializing two sets of network parameters
# Target network
self.target_network = create_network_with_nn([observation_size, *mid_layers, no_of_actions])
# Current network
self.current_network = create_network_with_nn([observation_size, *mid_layers, no_of_actions])
self.loss = []
self.step_counter = 0
self.learn_counter = 0
self.optim = optim.Adam(self.current_network.parameters(), lr=self.lr)
self.gradient_clip = gradient_clip
self.polyak = polyak
def store_experience(self, experience):
# observation, action, reward, next_observation, done = experience
self.memory.appendleft(experience)
def sample_experience(self):
        # Sample a batch of batch_size transitions from the replay memory
return random.sample(self.memory, self.batch_size)
def switch_networks(self):
        # Update the target network toward the current network after every fixed number of learning steps
# Replacing deep copy with polyak averaging
# self.target_network = deepcopy(self.current_network)
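        # Polyak (exponential moving average) update instead of a hard copy:
        #     theta_target <- polyak * theta_target + (1 - polyak) * theta_current
        # With polyak close to 1 the target network trails the current network slowly,
        # which keeps the bootstrapped targets stable.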
        for cur_params, tar_params in zip(self.current_network.parameters(), self.target_network.parameters()):
            tar_params.data.mul_(self.polyak)
            tar_params.data.add_((1 - self.polyak) * cur_params.data)
def reduce_epsilon(self):
        # Reducing the epsilon value over time
# self.e = self.e - self.e_decay if self.e > self.e_end else self.e_end
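        # Multiplicative decay: after t calls this gives e = max(e_end, epsilon_start * e_decay ** t),
        # instead of the linear schedule commented out above.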
self.e = max(self.e_end,self.e*self.e_decay)
def take_action(self, observation):
c = random.random()
self.reduce_epsilon()
if c > self.e:
observation = torch.FloatTensor(observation)[None,]
with torch.no_grad():
q_values = self.current_network(observation)
action = torch.argmax(q_values).item()
return action
else:
return random.randrange(0, self.n_actions)
def learn(self):
replay_experience = self.sample_experience()
        # There is no need to pretrain only the current network; in the beginning it would be
        # just as random as the target network
replay_experience = list(zip(*replay_experience))
observations = torch.FloatTensor(replay_experience[0])
actions = torch.FloatTensor(replay_experience[1])
rewards = torch.FloatTensor(replay_experience[2])
next_observations = torch.FloatTensor(replay_experience[3])
non_terminal_state = 1 - (torch.BoolTensor(replay_experience[4])).type(torch.float)
        # Loss = (target_value - actual_value)^2
        # Calculating the target value with the target network
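        # Bellman target: y = r + discount * max_a' Q_target(s', a') * (1 - done),
        # so terminal transitions contribute only their immediate reward.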
with torch.no_grad():
future_rewards = self.d * torch.max(self.target_network(next_observations).detach(), dim=1)[
0] * non_terminal_state
target_value = rewards + future_rewards
target_value = target_value.unsqueeze(-1)
self.optim.zero_grad()
z = self.current_network(observations).gather(1, actions.unsqueeze(1).long())
loss = F.mse_loss(z, target_value)
loss.backward()
        # Clipping gradients; clipping helps the agent reach good rewards faster
        torch.nn.utils.clip_grad_norm_(self.current_network.parameters(), self.gradient_clip)
self.optim.step()
        self.loss.append(loss.item())  # store a plain float instead of the graph-attached tensor
return loss.item()
if __name__ == "__main__":
# define system parameters
(discounted_factor, epsilon_start, epsilon_decay, epsilon_end, observation_size, no_of_actions, batch_size,
learning_rate,
max_memory_size, max_steps) = \
(0.99, 1.0, 0.995, 0.01, 8, 4, 64, 5e-4, 100, 1000)
# define env
env = gym.make('LunarLander-v2')
# define agent
agent = DQNAgentv3(discounted_factor, epsilon_start, epsilon_decay, epsilon_end, observation_size, no_of_actions,
batch_size, learning_rate,
max_memory_size)
# train the Q network
total_games = 350
scores = []
avg_score = []
avg_loss = []
game_loss = []
# train loop
for i in range(total_games):
observation = env.reset()
done = False
score = 0
t = 0
while not done and t < max_steps:
action = agent.take_action(observation)
next_observation, reward, done, _ = env.step(action)
agent.store_experience((observation, action, reward, next_observation, done))
score += reward
agent.step_counter += 1
observation = next_observation
t += 1
if agent.step_counter < batch_size:
continue
else:
loss = agent.learn()
game_loss.append(loss)
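                # Update the target network (via polyak averaging) every 4 learning steps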
if agent.learn_counter == 4:
agent.switch_networks()
agent.learn_counter = 0
agent.learn_counter += 1
print(f' Total reward for game {i} is {score}')
scores.append(score)
if len(game_loss) > 0:
avg_loss.append(sum(game_loss) / len(game_loss))
game_loss = []
    # Plotting the reward per game
    plt.figure(figsize=(8, 6))
    plt.title("Reward of DQN agent on LunarLander-v2 for each game")
    plt.plot(range(len(scores)), scores)
    plt.savefig("figures/DQN_Lunar_lander_rewards.png")  # save after plotting, otherwise the figure is empty
    plt.show()
# Plotting average loss
    # The loss does not necessarily converge: each time we observe it, it is computed on a different
    # random sample from the replay buffer, so it is not in sync with the actual training progress
plt.figure(figsize=(8, 6))
plt.title("Average Loss of DQN agent on LunarLander-v2")
plt.plot(range(len(avg_loss)), avg_loss)
plt.show()
# test policy
observation = env.reset()
test_episodes = 20
for i in range(test_episodes):
done = False
observation = env.reset()
while not done:
env.render()
time.sleep(1e-3)
action = agent.take_action(observation)
observation, r, done, _ = env.step(action)
class DQNAgent:
"""
Doesn't converge.
"""
def __init__(self, discounted_factor, epsilon_start, epsilon_decay, epsilon_end, observation_size, no_of_actions,
batch_size, learning_rate, max_memory_size):
'''
:param discounted_factor: discount factor for future rewards
:param epsilon_start: starting value of epsilon in e-greedy exploration
:param epsilon_decay: rate of epsilon decay
:param epsilon_end: ending value of epsilon
:param observation_size: state size or observation size or input size
:param no_of_actions: total number of discrete actions
:param batch_size: batch size for experience replay
        :param learning_rate: learning rate for the manual SGD update
        :param max_memory_size: total size of memory buffer
'''
self.d = discounted_factor
self.e = epsilon_start
self.e_decay = epsilon_decay
self.e_end = epsilon_end
self.o_size = observation_size
self.n_actions = no_of_actions
self.batch_size = batch_size
self.memory = collections.deque(maxlen=max_memory_size)
self.lr = learning_rate
        # Initializing two networks as a simple linear map (Wx + b) with no non-linearity
self.target_network = torch.randn(self.o_size + 1, self.n_actions, dtype=torch.float) * math.sqrt(
2 / (self.o_size + 1))
self.current_network = torch.randn(self.o_size + 1, self.n_actions, dtype=torch.float) * math.sqrt(
2 / (self.o_size + 1))
self.loss = []
self.step_counter = 0
self.learn_counter = 0
def store_experience(self, experience):
# observation, action, reward, next_observation, done = experience
self.memory.appendleft(experience)
def sample_experience(self):
        # Don't forget to add a check for len(self.memory) < batch_size before sampling
return random.sample(self.memory, self.batch_size)
def switch_networks(self):
# Switch target and current network after every 10 games
self.target_network.data = self.current_network.clone()
def reduce_epsilon(self):
# Putting this in take_action rather than learn
self.e = self.e - self.e_decay if self.e > self.e_end else self.e_end
def take_action(self, observation):
# don't forget to decay epsilon
c = random.random()
self.reduce_epsilon()
if c > self.e:
observation = appendOnes(torch.FloatTensor(observation))[None,]
actions = observation.mm(self.current_network)
action = torch.argmax(actions).item()
return action
else:
return random.randrange(0, self.n_actions)
def learn(self):
replay_experience = self.sample_experience()
        # There is no need to pretrain only the current network; in the beginning it would be
        # just as random as the target network
replay_experience = list(zip(*replay_experience))
observations = torch.FloatTensor(replay_experience[0])
actions = torch.FloatTensor(replay_experience[1])
rewards = torch.FloatTensor(replay_experience[2])
next_observations = torch.FloatTensor(replay_experience[3])
non_terminal_state = 1 - (torch.BoolTensor(replay_experience[4])).type(torch.float)
# Appending observations for bias
observations = appendOnes(observations)
next_observations = appendOnes(next_observations)
# Loss = { target_value - actual_value}^2
# Calculating target value using target_network
future_rewards = self.d * torch.max(next_observations.mm(self.target_network), dim=1)[0] * non_terminal_state
target_value = rewards + future_rewards
# Calculating actual values
        # Question: the scalar target is broadcast over the full action space / all Q values before
        # subtracting; is that an error? (v3 instead gathers only the Q value of the action taken.)
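        # For the linear model Q = X W (X being the bias-augmented observations), the loss used here is
        #     L = (1 / batch_size) * sum over the batch of (target - Q)^2
        # and its gradient with respect to W is
        #     dL/dW = -(2 / batch_size) * X^T (target - Q),
        # which is the update computed below.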
actual_value = observations.mm(self.current_network)
loss = MeanSquarredError(labels=target_value.unsqueeze(-1), targets=actual_value, batch_size=self.batch_size)
print(" Loss for step " + str(self.step_counter) + " = " + str(loss.item()))
self.loss.append(loss)
        # Gradient: transpose of observations (batch_size x (input_dim + 1)) matrix-multiplied with
        # (target - actual) (batch_size x output_dim)
gradients = -(2 / self.batch_size) * observations.t().mm(
(target_value.unsqueeze(-1) - actual_value))
# apply gradients
self.current_network = SGD(parameters=self.current_network, gradients=gradients, learning_rate=self.lr)
class DQNAgentv2:
"""
Doesn't converge.
"""
def __init__(self, discounted_factor, epsilon_start, epsilon_decay, epsilon_end, observation_size, no_of_actions,
batch_size, learning_rate, max_memory_size, layer1_dim=50, layer2_dim=10):
'''
:param discounted_factor: discount factor for future rewards
:param epsilon_start: starting value of epsilon in e-greedy exploration
:param epsilon_decay: rate of epsilon decay
:param epsilon_end: ending value of epsilon
:param observation_size: state size or observation size or input size
:param no_of_actions: total number of discrete actions
:param batch_size: batch size for experience replay
        :param learning_rate: learning rate for the manual SGD update
        :param max_memory_size: total size of memory buffer
        :param layer1_dim: size of the first hidden layer
        :param layer2_dim: size of the second hidden layer
'''
self.d = discounted_factor
self.e = epsilon_start
self.e_decay = epsilon_decay
self.e_end = epsilon_end
self.o_size = observation_size
self.n_actions = no_of_actions
self.batch_size = batch_size
self.memory = collections.deque(maxlen=max_memory_size)
self.lr = learning_rate
self.layer1_dim = layer1_dim
self.layer2_dim = layer2_dim
# Initializing two sets of network parameters
# Target network
self.Wt1 = torch.randn(self.o_size + 1, self.layer1_dim, dtype=torch.float) * math.sqrt(
2 / (self.o_size + 1))
self.Wt2 = torch.randn(self.layer1_dim + 1, self.layer2_dim, dtype=torch.float) * math.sqrt(
2 / (self.layer1_dim + 1))
self.Wt3 = torch.randn(self.layer2_dim + 1, self.n_actions, dtype=torch.float) * math.sqrt(
2 / (self.layer2_dim + 1))
self.target_network = [self.Wt1, self.Wt2, self.Wt3]
# Current network
self.Wc1 = torch.randn(self.o_size + 1, self.layer1_dim, dtype=torch.float) * math.sqrt(
2 / (self.o_size + 1))
self.Wc2 = torch.randn(self.layer1_dim + 1, self.layer2_dim, dtype=torch.float) * math.sqrt(
2 / (self.layer1_dim + 1))
self.Wc3 = torch.randn(self.layer2_dim + 1, self.n_actions, dtype=torch.float) * math.sqrt(
2 / (self.layer2_dim + 1))
self.current_network = [self.Wc1, self.Wc2, self.Wc3]
self.loss = []
self.step_counter = 0
self.learn_counter = 0
def store_experience(self, experience):
# observation, action, reward, next_observation, done = experience
self.memory.appendleft(experience)
def sample_experience(self):
        # Don't forget to add a check for len(self.memory) < batch_size before sampling
return random.sample(self.memory, self.batch_size)
def switch_networks(self):
# Switch target and current network after every 10 games
for i, j in enumerate(self.current_network):
self.target_network[i] = j.clone()
def reduce_epsilon(self):
# Putting this in take_action rather than learn
self.e = self.e - self.e_decay if self.e > self.e_end else self.e_end
def take_action(self, observation):
# don't forget to decay epsilon
c = random.random()
self.reduce_epsilon()
if c > self.e:
observation = torch.FloatTensor(observation)[None,]
actions = self.forward(self.current_network, observation)
action = torch.argmax(actions).item()
return action
else:
return random.randrange(0, self.n_actions)
def forward(self, network, input_):
for count, i_ in enumerate(network):
input_ = appendOnes(input_)
z = input_.mm(i_)
if count != (len(network) - 1):
x = F.relu_(z)
input_ = x
return z
def forward_for_gradients(self, W1, W2, W3, input_, target_, actions):
        # I know this is a bloated implementation, resulting in lower efficiency
layers = [W1, W2, W3]
for count, i_ in enumerate(layers):
input_ = appendOnes(input_)
z = input_.mm(i_)
if count != (len(layers) - 1):
x = F.relu_(z)
# x = z
input_ = x
z = z.gather(1, actions.unsqueeze(1).long())
# loss = MeanSquarredError(labels=target_.unsqueeze(-1), targets=z, batch_size=self.batch_size)
# try F.mse_loss and validate gradients
loss = F.mse_loss(z, target_.unsqueeze(-1))
return loss
def learn(self):
replay_experience = self.sample_experience()
        # There is no need to pretrain only the current network; in the beginning it would be
        # just as random as the target network
replay_experience = list(zip(*replay_experience))
observations = torch.FloatTensor(replay_experience[0])
actions = torch.FloatTensor(replay_experience[1])
rewards = torch.FloatTensor(replay_experience[2])
next_observations = torch.FloatTensor(replay_experience[3])
non_terminal_state = 1 - (torch.BoolTensor(replay_experience[4])).type(torch.float)
# Loss = { target_value - actual_value}^2
# Calculating target value using target_network
with torch.no_grad():
future_rewards = self.d * torch.max(self.forward(self.target_network, next_observations), dim=1)[
0] * non_terminal_state
target_value = rewards + future_rewards
# Calculating actual values
        # To get gradients from vjp, each of the current network's parameter tensors has to be
        # passed to it as a separate input
inputs_for_gradients = (
self.current_network[0], self.current_network[1], self.current_network[2], observations, target_value, actions)
        # Documentation of vjp: https://pytorch.org/docs/stable/autograd.html#torch.autograd.functional.vjp
        # It returns the function's output together with the gradients of the output with respect to the
        # inputs (the vector-Jacobian product). I was experimenting with it while trying to understand autograd.
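        # A minimal toy example of vjp (purely illustrative, not used in training): for a scalar-valued
        # function the returned vjp is just the gradient at the inputs, e.g.
        #     f = lambda w: (w ** 2).sum()
        #     value, grads = vjp(f, torch.tensor([1.0, 2.0]))
        #     # value == tensor(5.), grads == tensor([2., 4.])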
loss, gradients = vjp(self.forward_for_gradients, inputs_for_gradients)
print(" Loss for step " + str(self.step_counter) + " = " + str(loss.item()))
self.loss.append(loss)
for count, i__ in enumerate(gradients[0:3]):
self.current_network[count] = SGD(parameters=self.current_network[count], gradients=i__,
learning_rate=self.lr)