import logging
import os
from collections import deque
from typing import List

import numpy as np

from agents import Agent
from project import Project


class ForwardSarsaLambda(Agent):
"""Implementation of the Forward Sarsa(λ) algorithm.
This algorithm is based on the true online Sarsa(λ) algorithm by
van Seijen, Harm; Sutton, Richard S. but a more efficient implementation
to use with deep neural networks.
Some related implementations:
* Episodic Semi-gradient Sarsa for Estimating q using NN with Keras:
* https://gist.github.com/FlashTek/0dfddf46c4d50c4e068f1ecbad1d03b5
* https://chat.stackoverflow.com/rooms/150499/discussion-between-flashtek-and-neil-slater
* https://stackoverflow.com/questions/45377404/episodic-semi-gradient-sarsa-with-neural-network/45391630#45391630
* Sarsa Keras
* https://github.com/keras-rl/keras-rl/blob/master/rl/agents/sarsa.py
* Python implementations of Sutton, Barto (2018) (incl. TD(λ))
* https://github.com/ShangtongZhang/reinforcement-learning-an-introduction
* https://github.com/ShangtongZhang/reinforcement-learning-an-introduction/blob/ed1ddb0c40e37a8846665f26f01e0522b232634f/chapter12/RandomWalk.py
* Sarsa(λ)
* https://github.com/codebox/sarsa-lambda/blob/master/strategy.py
* TD gammon
* https://github.com/fomorians/td-gammon/blob/master/model.py
* True Online TD Lambda with Fourier Basis
* https://github.com/EllaBot/true-online-td-lambda/blob/master/true_online_td_lambda/true_online_td_lambda.py
"""

    def __init__(self, episodes, projects: List[Project], model, lam=0.8,
                 gamma=0.95, eta=0.01, epsilon=1.0):
        """Initialises the algorithm's parameters.

        :param episodes: number of training episodes per project
        :param projects: list of projects to train on
        :param model: class with a Keras or TensorFlow model named ``model``
        :param lam: the trace-decay parameter λ ∈ [0, 1]; it determines the
            rate at which the eligibility trace decays:
            λ = 0 results in a one-step TD return,
            λ = 1 results in Monte Carlo behaviour
        :param gamma: the discount factor γ ∈ [0, 1]
        :param eta: η will typically be set to 0.01 or larger; it bounds the
            weight of the TD errors dropped by truncating the λ-return after
            K steps
        :param epsilon: greediness or exploration parameter ϵ ∈ [0, 1]; ϵ should
            start at a value close to or exactly 1 and decrease during training
        """
        super().__init__(projects=projects, model=model)
        self.lam = lam
        self.gamma = gamma
        self.epsilon = epsilon
        self.episodes = episodes * len(projects)
        # K is the smallest horizon with (γλ)^K <= η, i.e. the point at which
        # further TD errors contribute less than η to the λ-return
        self.K = int(np.ceil(np.log(eta) / np.log(gamma * lam)))
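        # Worked example with the defaults (illustration only): γλ = 0.95 * 0.8
        # = 0.76, so K = ceil(ln(0.01) / ln(0.76)) = ceil(16.78) = 17, i.e. the
        # K-bounded λ-return is recomputed from scratch every 17 steps.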
        self.model = model.model
        # (γλ)^(K-1): weight of the newest TD error in a completed K-step window
        self.c_final = (gamma * lam) ** (self.K - 1)
        # FIFO buffer of ‹state, ρ› pairs awaiting their K-bounded λ-return targets
        self.F = deque(maxlen=self.K)

    def __repr__(self):
        return "ForwardSarsaLambda({}, {})".format(self.lam, self.gamma)

    def train(self):
        """Uses the Forward Sarsa(λ) algorithm to train the model."""
        for episode in range(self.episodes):
            if ((episode + 1) % len(self.projects)) == 0:
                logging.info('episode: {}'.format((episode + 1) // len(self.projects)))
                self.model.save_weights(os.path.join(
                    'models', self.model_name + '-' +
                    str(int(episode / len(self.projects))) + '.h5'))
            target_sync = 0
            i = 0
            c = 1
            v_current = 0
            ready = False
            t = 0  # elapsed project time in the current episode
            while not self.project.is_finished():
                value, best_action, durations, state = self.act()
                t += self.project.next(best_action, durations)
                if self.project.is_finished():
                    value_next = 0
                    reward = 1 / t  # shorter project durations earn a higher terminal reward
                else:
                    value_next, action_next, durations_next, _ = self.act()
                    reward = 0
                # Assumed fix: v_current was never updated in the original, so the
                # synced target was always re-seeded with 0; track v(S') here instead.
                v_current = value_next
                rho = reward + self.gamma * (1 - self.lam) * value_next  # ρ, see equation (7)
                self.F.appendleft((state, rho))  # push tuple ‹S, ρ› on F
                delta = reward + self.gamma * value_next - value  # one-step TD error δ
'''"We can avoid these blow-ups in an elegant way, by recomputing the K-bounded
λ-return from scratch every K time steps.", cf. p. 6'''
if i == self.K - 1:
target = target_sync
target_sync = v_current
i = 0
c = 1
ready = True
else:
target_sync += c * delta
i += 1
c *= (self.gamma * self.lam)
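                # Targets for consecutive pops obey the λ-return recursion
                # G(S_t) = ρ_t + γλ·G(S_{t+1}); the divisions below invert it:
                # G(S_{t+1}) = (G(S_t) - ρ_t) / (γλ).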
                if ready:
                    target += self.c_final * delta  # complete the oldest state's target
                    state_pop, rho_pop = self.F.pop()
                    try:
                        self.model.fit(x=state_pop, y=target, verbose=0)
                    except ValueError as e:
                        print('Value Error!')
                        print('x:', state_pop)
                        print('y:', target)
                        print(e)
                    if self.K != 1:
                        target = (target - rho_pop) / (self.gamma * self.lam)
            # decay so that ϵ falls to 5% of its initial value in the last
            # episode: the per-episode factor is exp(ln(0.05) / episodes)
            self.epsilon = self.epsilon * np.exp(np.log(0.05) / self.episodes)
            if not ready:  # the episode ended before the first K-step window filled
                target = target_sync
            while len(self.F) > 0:  # flush the remaining states of the episode
                state_pop, rho_pop = self.F.pop()
                callbacks = [self.tensorboard] if len(self.F) == 1 else None
                self.model.fit(x=state_pop, y=target, verbose=0, callbacks=callbacks)
                if self.K != 1:
                    target = (target - rho_pop) / (self.gamma * self.lam)
            logging.debug('epsilon: %f', self.epsilon)
            self.next_project()

    def act(self):
        """Chooses an action in an ϵ-greedy fashion to allow sufficient exploration.

        With probability ϵ the action is taken randomly, and with
        probability (1 - ϵ) the action with the highest value is executed.
        ϵ usually decreases during training so that the policy's decision
        making converges.

        :return: value, best action, durations, state-action input vector
        """
        state, durations = self.project.state()
        actions = self.project.get_actions()
        if len(actions) > 1:
            if np.random.rand() < self.epsilon:  # explore: random action
                best_action = np.random.choice(actions)
                state_action = self.input_vector(state, best_action)
                value = self.model.predict(state_action)
            else:  # exploit: greedy action
                value, best_action, state_action = self.get_best_action(state, actions)
        else:  # there is only the "do-nothing/wait" action
            state_action = self.input_vector(state, None)
            value = self.model.predict(state_action)
            best_action = []
            durations = {}
        return value, best_action, durations, state_action
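

if __name__ == '__main__':
    # Minimal usage sketch (an assumption-laden illustration, not part of the
    # original file): `SomeModelWrapper` is a hypothetical class exposing a
    # compiled Keras model via `.model`, and the Project constructor arguments
    # are project-specific. Uncomment and adapt to run:
    #
    # logging.basicConfig(level=logging.INFO)
    # projects = [Project(...) for _ in range(3)]      # hypothetical setup
    # agent = ForwardSarsaLambda(episodes=100, projects=projects,
    #                            model=SomeModelWrapper(),
    #                            lam=0.8, gamma=0.95, eta=0.01, epsilon=1.0)
    # agent.train()
    pass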