# naf.py
# Forked from axnedergaard/normalized-advantage-functions
#Based on 'Continuous Deep Q-Learning with Model-based Acceleration' by Gu et al., 2016. Available from: https://arxiv.org/pdf/1603.00748.pdf
#TODO
#investigate and fix nan action bug
#confirm saver working
#additional observation/action space support (continuous)
#experiment with different advantage functions/covariance matrices for advantage function
#more clever exploration policy
#adaptive batch size?
#memories weighted by information/loss?
#improved network initialisation?
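#
#The NAF parameterisation implemented below (restating the paper, for reference):
#  Q(x,u) = V(x) + A(x,u),   A(x,u) = -1/2 * (u - mu(x))^T * P(x) * (u - mu(x)),
#with P(x) = L(x) L(x)^T built from a lower-triangular L(x) whose diagonal is exponentiated,
#so P(x) is positive definite and argmax_u Q(x,u) = mu(x) can be read off directly.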
from baselines.deepqnaf import naf
from baselines import logger
import tensorflow as tf
import numpy as np
import random
from tensorflow.python.ops.distributions.util import fill_triangular

class Memory:
    def __init__(self, capacity, batch_size, v):
        self.m = []
        self.ready = 0
        self.full = 0
        self.capacity = capacity
        self.batch_size = batch_size
        self.v = v

    def store(self,d):
        [s,a,r,s_next,terminal] = d
        self.m.append([s,a,r,s_next,terminal])
        if not self.full and len(self.m) > self.capacity:
            self.full = 1 #capacity reached, evict the oldest transition from now on
        if self.full:
            self.m.pop(0)
        if not self.ready and len(self.m) >= self.batch_size:
            self.ready = 1
            if self.v > 0:
                print("[Memory ready]")

    def sample(self):
        return random.sample(self.m, self.batch_size)
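
#Illustrative use of Memory (not part of the original training flow); s, a, r, s_next below
#are placeholders for values coming from an environment:
#  memory = Memory(10000, 64, v=0)
#  memory.store([s, a, r, s_next, 0])
#  batch = memory.sample()  #valid once 64 transitions are stored, which is what `ready` guards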

class Layer:
    def __init__(self, input_layer, out_n, activation=None, batch_normalize=False):
        x = input_layer
        batch_size, in_n = np.shape(x)
        in_n = int(in_n)
        if batch_normalize:
            variance_epsilon = 0.000001
            decay = 0.999
            self.gamma = tf.Variable(tf.constant(1,shape=[in_n],dtype=tf.float32), trainable=True)
            self.beta = tf.Variable(tf.constant(0,shape=[in_n],dtype=tf.float32), trainable=True)
            self.moving_mean = tf.Variable(tf.constant(0,shape=[in_n],dtype=tf.float32), trainable=False)
            self.moving_var = tf.Variable(tf.constant(1,shape=[in_n],dtype=tf.float32), trainable=False)
            mean,var = tf.nn.moments(x, axes=[0]) #batch statistics used to update the moving averages
            update_mean = self.moving_mean.assign(decay*self.moving_mean + (1-decay)*mean)
            update_var = self.moving_var.assign(decay*self.moving_var + (1-decay)*var)
            with tf.control_dependencies([update_mean, update_var]):
                x = tf.nn.batch_normalization(x, self.moving_mean, self.moving_var, self.beta, self.gamma, variance_epsilon)
        self.w = tf.Variable(tf.random_uniform([in_n,out_n],-0.1,0.1), trainable=True)
        self.b = tf.Variable(tf.random_uniform([out_n],-0.1,0.1), trainable=True)
        self.z = tf.matmul(x, self.w) + self.b
        if activation is not None:
            self.h = activation(self.z)
        else:
            self.h = self.z
        self.variables = [self.w, self.b]
        if batch_normalize:
            self.variables += [self.gamma, self.beta, self.moving_mean, self.moving_var]

    def construct_update(self, from_layer, tau):
        update = []
        for x,y in zip(self.variables, from_layer.variables):
            update += [x.assign(x*tau + (1-tau)*y)]
        return update
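
#Illustrative sketch of the soft target update built by construct_update (hyperparameters
#invented): each target variable x tracks its online counterpart y via x <- tau*x + (1-tau)*y,
#and Agent.update_target() below runs the returned assign ops after every learning step.
#  online = Layer(states, 64, activation=tf.nn.relu)
#  target = Layer(states, 64, activation=tf.nn.relu)
#  update_ops = target.construct_update(online, tau=0.99)  #run with sess.run(update_ops)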

class Agent:
    def __init__(self, v, observation_space, action_space, learning_rate, batch_normalize, gamma, tau, epsilon, hidden_size, hidden_n, hidden_activation, batch_size, memory_capacity, load_path, covariance):
        self.v = v
        self.memory = Memory(memory_capacity,batch_size,v)
        self.observation_space = observation_space
        self.action_space = action_space
        self.state_n = observation_space.shape[0]
        self.action_n = action_space.shape[0]
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.tau = tau
        self.epsilon = epsilon
        self.resets = 0
        H_layer_n = hidden_n
        H_n = hidden_size
        M_n = int((self.action_n)*(self.action_n+1)/2)
        V_n = 1
        mu_n = self.action_n
        tf.reset_default_graph()
        #neural network architecture
        self.x = tf.placeholder(shape=[None,self.state_n], dtype=tf.float32, name="state")
        self.u = tf.placeholder(shape=[None,self.action_n], dtype=tf.float32, name="action")
        self.target = tf.placeholder(shape=[None,1], dtype=tf.float32, name="target")
        self.H = Layer(self.x, H_n, activation=hidden_activation, batch_normalize=batch_normalize)
        self.t_H = Layer(self.x, H_n, activation=hidden_activation, batch_normalize=batch_normalize) #target
        self.updates = self.t_H.construct_update(self.H, self.tau)
        for i in range(1,H_layer_n):
            self.H = Layer(self.H.h, H_n, activation=hidden_activation, batch_normalize=batch_normalize)
            self.t_H = Layer(self.t_H.h, H_n, activation=hidden_activation, batch_normalize=batch_normalize) #target
            self.updates += self.t_H.construct_update(self.H, self.tau)
        self.V = Layer(self.H.h, V_n, batch_normalize=batch_normalize)
        self.t_V = Layer(self.t_H.h, V_n, batch_normalize=batch_normalize) #target
        self.updates += self.t_V.construct_update(self.V, self.tau)
        self.mu = Layer(self.H.h, mu_n, activation=tf.nn.tanh, batch_normalize=batch_normalize)
        if covariance == "identity": #identity covariance
            self.P = tf.eye(self.action_n,batch_shape=[tf.shape(self.x)[0]])
        elif covariance == "diagonal": #diagonal covariance with nn inputs
            self.O = Layer(self.H.h, mu_n, batch_normalize=batch_normalize) #nn input to diagonal covariance
            self.P = tf.matrix_set_diag(tf.eye(self.action_n,batch_shape=[tf.shape(self.x)[0]]), self.O.h)
        else: #original NAF covariance P = L L^T with L lower-triangular
            self.M = Layer(self.H.h, M_n, activation=tf.nn.tanh, batch_normalize=batch_normalize)
            self.N = fill_triangular(self.M.h)
            self.L = tf.matrix_set_diag(self.N, tf.exp(tf.matrix_diag_part(self.N)))
            self.P = tf.matmul(self.L, tf.matrix_transpose(self.L))
        #self.P_inverse = tf.matrix_inverse(self.P) #precision matrix for exploration policy
        self.D = tf.reshape(self.u - self.mu.h, [-1,1,self.action_n])
        self.A = (-1.0/2.0)*tf.reshape(tf.matmul(tf.matmul(self.D, self.P), tf.transpose(self.D, perm=[0,2,1])), [-1,1]) #advantage function
        self.Q = self.A + self.V.h
        self.loss = tf.reduce_sum(tf.square(self.target - self.Q))
        self.optimiser = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.loss)
        self.sess = tf.Session()
        init = tf.global_variables_initializer()
        self.sess.run(init)
        self.saver = tf.train.Saver()
        if load_path is not None:
            self.saver.restore(self.sess, load_path)

    def reset(self): #reset in between episodes (for example update epsilon); i is the episode number
        if self.memory.ready:
            self.resets = self.resets + 1
            i = self.resets
            # self.epsilon = 1.0/(1+i)
            self.epsilon = 1.0/(1.0+0.1*i+(1.0/(i+1))*np.log(i)) #derived through black magic for inverted double pendulum
            if self.v > 1:
                print("[Update epsilon: " + str(self.epsilon) + "]")
            #self.epsilon = 1.0/(np.log(i+1)/np.log(3) + 0.001) #derived through black magic for inverted double pendulum 2

    def save(self, path):
        self.saver.save(self.sess, path)

    def get_action(self,s):
        mu = self.sess.run(self.mu.h, feed_dict={self.x:np.reshape(s,[1,-1])})[0]
        #random action with probability epsilon
        if np.random.rand() < self.epsilon:
            action = np.random.rand(self.action_n)*2-1 #random action in [-1,1]
        else:
            action = mu
        return action
        # covariance = np.eye(self.action_n)
        # return self.noise(mu, covariance)
        #mu,p_inv = self.sess.run([self.mu.h,self.P_inverse],feed_dict={self.x:np.reshape(s,[1,-1])})[0]
        #return self.noise(mu, p_inv)

    def observe(self,state,action,reward,state_next,terminal):
        self.memory.store((state,action,reward,state_next,terminal))

    def learn(self):
        if self.memory.ready:
            batch_target = []
            batch_state = []
            batch_action = []
            batch_reward = []
            batch_state_next = []
            batch_terminal = []
            for [t_s,t_a,t_r,t_s_next,t_terminal] in self.memory.sample():
                batch_state_next += [t_s_next]
                batch_state += [t_s]
                batch_action += [t_a]
                batch_reward += [t_r]
                batch_terminal += [t_terminal]
            batch_target = self.get_target(batch_action, batch_reward, batch_state_next, batch_terminal)
            #l,a,self.p_inv = self.backprop(batch_state, batch_action, batch_target)
            l,a = self.backprop(batch_state, batch_action, batch_target)
            self.update_target()
            if self.v > 2:
                print(np.mean(a))

    def noise(self, mean, covariance):
        return np.random.multivariate_normal(mean,self.epsilon*covariance)

    def scale(self,actions, low, high): #assume actions lie in [-1,1]
        actions = np.clip(actions, -1, 1)
        scaled_actions = []
        for a in actions:
            scaled_actions += [(a+1)*(high-low)/2+low]
        return np.reshape(scaled_actions,[-1]) #range [low,high]

    def get_target(self,a,r,s_next,terminal):
        targets = np.reshape(r,[-1,1]) + np.reshape(self.gamma*self.sess.run(self.t_V.h,feed_dict={self.x:s_next,self.u:a}),[-1,1])
        for i in range(len(terminal)):
            if terminal[i]:
                targets[i] = r[i]
        return targets
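
    #Worked example for the backup above (illustrative numbers): for a single non-terminal
    #transition with r = 1.0, gamma = 0.99 and target value V'(s_next) = 2.0, the target is
    #y = 1.0 + 0.99*2.0 = 2.98; if the transition were terminal, y would be clamped to 1.0.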

    def backprop(self,batch_state,batch_action,batch_target):
        l,a,_ = self.sess.run([self.loss,self.A,self.optimiser],feed_dict={self.x:batch_state, self.target:batch_target, self.u:batch_action})
        return l,a

    def update_target(self):
        for update in self.updates:
            self.sess.run(update)
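

#Minimal usage sketch (not part of the original file): wiring the Agent into a Gym-style
#continuous-control loop. The environment name, hyperparameters and episode count are
#illustrative assumptions; `gym` (classic API: reset() -> state, step() -> 4-tuple) is assumed
#to be installed, TensorFlow 1.x is assumed as for the rest of this module, and the imports
#at the top of the file must resolve.
if __name__ == "__main__":
    import gym

    env = gym.make("Pendulum-v0") #assumed example environment with a 1-D continuous action
    agent = Agent(v=1,
                  observation_space=env.observation_space,
                  action_space=env.action_space,
                  learning_rate=0.001,
                  batch_normalize=False,
                  gamma=0.99,
                  tau=0.99,
                  epsilon=0.1,
                  hidden_size=100,
                  hidden_n=2,
                  hidden_activation=tf.nn.relu,
                  batch_size=64,
                  memory_capacity=10000,
                  load_path=None,
                  covariance="original")
    low, high = float(env.action_space.low[0]), float(env.action_space.high[0])
    for episode in range(10):
        state = env.reset()
        agent.reset() #updates epsilon once the replay memory is ready
        terminal = False
        while not terminal:
            action = agent.get_action(state) #action in [-1,1]
            scaled = agent.scale(action, low, high) #map to the environment's action range
            state_next, reward, terminal, _ = env.step(scaled)
            agent.observe(state, action, reward, state_next, terminal)
            agent.learn()
            state = state_next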