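"""Deep Q-network model used to predict steering angles for a self-driving car.

Defines RlModel, which wraps a Keras action model and a cloned target model,
and provides helpers to serialize weights into JSON-friendly packets, apply
gradient updates computed by agents, and turn predicted actions into car
control signals. A usage sketch is provided under __main__ at the bottom of
the file.
"""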
# import useful Python libraries
import numpy as np
import threading
import os
import tensorflow as tf
from keras.models import Model, clone_model
from keras.layers import Conv2D, MaxPooling2D, Dropout, Flatten, Dense, Input
from keras.optimizers import Adam
import keras.backend as k
from keras.initializers import random_normal

# Initiate the GPU for model training, preventing TensorFlow from allocating
# the entire GPU at the start of the program
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.Session(config=config)
k.set_session(session)

# Default location of the pre-trained model weights
weights_path = '\\pretrain_model_weights.h5'


class RlModel:
    def __init__(self, weights_path, train_conv_layers):
        self.__angle_values = [-1, -0.5, 0, 0.5, 1]
        self.__nb_actions = 5
        self.__gamma = 0.99

        # Define the model
        activation = 'relu'
        pic_input = Input(shape=(59, 255, 3))

        img_stack = Conv2D(16, (3, 3), name='convolution0', padding='same', activation=activation,
                           trainable=train_conv_layers)(pic_input)
        img_stack = MaxPooling2D(pool_size=(2, 2))(img_stack)
        img_stack = Conv2D(32, (3, 3), activation=activation, padding='same', name='convolution1',
                           trainable=train_conv_layers)(img_stack)
        img_stack = MaxPooling2D(pool_size=(2, 2))(img_stack)
        img_stack = Conv2D(32, (3, 3), activation=activation, padding='same', name='convolution2',
                           trainable=train_conv_layers)(img_stack)
        img_stack = MaxPooling2D(pool_size=(2, 2))(img_stack)
        img_stack = Flatten()(img_stack)
        img_stack = Dropout(0.2)(img_stack)
        img_stack = Dense(128, name='rl_dense', kernel_initializer=random_normal(stddev=0.01))(img_stack)
        img_stack = Dropout(0.2)(img_stack)
        output = Dense(self.__nb_actions, name='rl_output', kernel_initializer=random_normal(stddev=0.01))(img_stack)

        opt = Adam()
        self.__action_model = Model(inputs=[pic_input], outputs=output)
        self.__action_model.compile(optimizer=opt, loss='mean_squared_error')
        self.__action_model.summary()

        # Load pre-trained weights for the convolutional layers
        if weights_path is not None and len(weights_path) > 0:
            print('Loading weights from {0}...'.format(weights_path))
            print('Current working dir is {0}'.format(os.getcwd()))
            self.__action_model.load_weights(weights_path, by_name=True)

            print('First layer: ')
            w = np.array(self.__action_model.get_weights()[0])
            print(w)
        else:
            print('Not loading weights')

        # Set up the target model, which allows the model to converge more rapidly
        self.__action_context = tf.get_default_graph()
        self.__target_model = clone_model(self.__action_model)

        self.__target_context = tf.get_default_graph()
        self.__model_lock = threading.Lock()

    # Loads the model weights from a packet produced by to_packet()
    def from_packet(self, packet):
        with self.__action_context.as_default():
            self.__action_model.set_weights([np.array(w) for w in packet['action_model']])
            self.__action_context = tf.get_default_graph()
        if 'target_model' in packet:
            with self.__target_context.as_default():
                self.__target_model.set_weights([np.array(w) for w in packet['target_model']])
                self.__target_context = tf.get_default_graph()

    # Serializes the model weights into a packet for transfer to the agent
    def to_packet(self, get_target=True):
        packet = {}
        with self.__action_context.as_default():
            packet['action_model'] = [w.tolist() for w in self.__action_model.get_weights()]
            self.__action_context = tf.get_default_graph()
        if get_target:
            with self.__target_context.as_default():
                packet['target_model'] = [w.tolist() for w in self.__target_model.get_weights()]
        return packet

    # Updates the model with the supplied gradients.
    # This is used by the trainer to accept a training iteration update from the agent.
    def update_with_gradient(self, gradients, should_update_critic):
        with self.__action_context.as_default():
            action_weights = self.__action_model.get_weights()
            if len(action_weights) != len(gradients):
                raise ValueError('len of action_weights is {0}, but len gradients is {1}'.format(
                    len(action_weights), len(gradients)))

            print('UPDATE GRADIENT DEBUG START')

            dx = 0
            for i in range(len(action_weights)):
                action_weights[i] += gradients[i]
                dx += np.sum(np.sum(np.abs(gradients[i])))
            print('Moved weights {0}'.format(dx))
            self.__action_model.set_weights(action_weights)
            self.__action_context = tf.get_default_graph()

        if should_update_critic:
            with self.__target_context.as_default():
                print('Updating critic')
                self.__target_model.set_weights([np.array(w, copy=True) for w in action_weights])

        print('UPDATE GRADIENT DEBUG END')

    # Copies the action model weights into the target (critic) model
    def update_critic(self):
        with self.__target_context.as_default():
            self.__target_model.set_weights([np.array(w, copy=True) for w in self.__action_model.get_weights()])

    # Given a set of training data, trains the model and determines the gradients.
    # The agent will use this to compute the model updates to send to the trainer.
    def get_gradient_update_from_batches(self, batches):
        pre_states = np.array(batches['pre_states'])
        post_states = np.array(batches['post_states'])
        rewards = np.array(batches['rewards'])
        actions = list(batches['actions'])
        is_not_terminal = np.array(batches['is_not_terminal'])

        # For now, our model only takes a single image as input: the latest frame
        # from each stack of states.
        pre_states = pre_states[:, 3, :, :, :]
        post_states = post_states[:, 3, :, :, :]

        print('START GET GRADIENT UPDATE DEBUG')

        # We only have labels for the action that the agent actually took.
        # To prevent the model from training the other actions, figure out what the
        # model currently predicts for each input. Then, the gradients with respect
        # to those outputs will always be zero.
        with self.__action_context.as_default():
            labels = self.__action_model.predict([pre_states], batch_size=32)

        # Find out what the target model will predict for each post-decision state.
        with self.__target_context.as_default():
            q_futures = self.__target_model.predict([post_states], batch_size=32)

        # Apply the Bellman equation
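        # to compute the Q-learning target for each sample i:
        #   y_i = r_i + gamma * max_a' Q_target(s'_i, a')
        # where the discounted future term is zeroed out for terminal transitions.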
        q_futures_max = np.max(q_futures, axis=1)
        q_labels = (q_futures_max * is_not_terminal * self.__gamma) + rewards

        # Update the label only for the actions that were actually taken.
        for i in range(len(actions)):
            labels[i][actions[i]] = q_labels[i]

        # Perform a training iteration.
        with self.__action_context.as_default():
            original_weights = [np.array(w, copy=True) for w in self.__action_model.get_weights()]
            self.__action_model.fit([pre_states], labels, epochs=1, batch_size=32, verbose=1)

            # Compute the gradients as the weight deltas from this training iteration
            new_weights = self.__action_model.get_weights()
            gradients = []
            dx = 0
            for i in range(len(original_weights)):
                gradients.append(new_weights[i] - original_weights[i])
                dx += np.sum(np.sum(np.abs(new_weights[i] - original_weights[i])))
            print('change in weights from training iteration: {0}'.format(dx))

        print('END GET GRADIENT UPDATE DEBUG')

        # NumPy arrays are not JSON serializable by default
        return [w.tolist() for w in gradients]

    # Performs a state prediction given the model input
    def predict_state(self, observation):
        if isinstance(observation, list):
            observation = np.array(observation)

        # Our model only predicts on a single state, so take the latest image.
        observation = observation[3, :, :, :]
        observation = observation.reshape(1, 59, 255, 3)
        with self.__action_context.as_default():
            predicted_qs = self.__action_model.predict([observation])

        # Select the action with the highest Q value
        predicted_state = np.argmax(predicted_qs)
        return predicted_state, predicted_qs[0][predicted_state]

    # Converts the current state to (steering, throttle, brake) control signals.
    # As we are only predicting the steering angle, a simple bang-bang controller
    # keeps the car at a roughly constant speed.
    def state_to_control_signals(self, state, car_state):
        if car_state.speed > 9:
            return self.__angle_values[state], 0, 1
        else:
            return self.__angle_values[state], 1, 0

    # Gets a random state (action index).
    # Used during annealing for exploration.
    def get_random_state(self):
        # np.random.randint treats `high` as exclusive, so pass nb_actions itself
        # to keep every action reachable (high=nb_actions - 1 would never sample
        # the last action).
        return np.random.randint(low=0, high=self.__nb_actions)
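

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustration only, not part of the training pipeline):
# builds the model without pre-trained weights, queries it on a dummy
# four-frame observation, and round-trips the weights through a packet. The
# CarState stand-in below is a hypothetical stub; in the real project the car
# state comes from the simulator client.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    from collections import namedtuple

    model = RlModel(weights_path=None, train_conv_layers=True)

    # A dummy observation: a stack of 4 frames, each of shape (59, 255, 3)
    observation = np.zeros((4, 59, 255, 3))
    state, q_value = model.predict_state(observation)
    print('Predicted state {0} with Q value {1}'.format(state, q_value))

    # Convert the predicted state into (steering, throttle, brake) signals
    CarState = namedtuple('CarState', ['speed'])  # hypothetical stand-in
    print(model.state_to_control_signals(state, CarState(speed=5)))

    # Serialize the weights into a JSON-friendly packet and load them back
    packet = model.to_packet(get_target=True)
    model.from_packet(packet)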