-
Notifications
You must be signed in to change notification settings - Fork 1.9k
/
Copy path405_DQN_reinforcement_learning.py
126 lines (103 loc) · 4.09 KB
/
405_DQN_reinforcement_learning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
"""
To learn more, visit my Python tutorial page: https://morvanzhou.github.io/tutorials/
My Youtube Channel: https://www.youtube.com/user/MorvanZhou
More about Reinforcement learning: https://morvanzhou.github.io/tutorials/machine-learning/reinforcement-learning/
Dependencies:
tensorflow: 1.1.0
matplotlib
numpy
gym: 0.8.1
"""
import tensorflow as tf
import numpy as np
import gym
# Reproducibility: seed TensorFlow's graph-level RNG and NumPy's global RNG.
tf.set_random_seed(1)
np.random.seed(1)

# ---- Hyper parameters ----
BATCH_SIZE = 32                 # transitions sampled per training step
LR = 0.01                       # learning rate handed to the optimizer
EPSILON = 0.9                   # probability of taking the greedy action
GAMMA = 0.9                     # reward discount factor
TARGET_REPLACE_ITER = 100       # learn() calls between target-network syncs
MEMORY_CAPACITY = 2000          # number of rows in the replay memory

# ---- Mutable counters (updated through `global` by the functions below) ----
MEMORY_COUNTER = 0              # transitions stored so far
LEARNING_STEP_COUNTER = 0       # learn() calls so far

# ---- Environment: work on the unwrapped (base) CartPole environment ----
env = gym.make('CartPole-v0').unwrapped
N_ACTIONS = env.action_space.n
N_STATES = env.observation_space.shape[0]

# Replay memory, one transition per row: [s (N_STATES), a, r, s_ (N_STATES)].
MEMORY = np.zeros((MEMORY_CAPACITY, 2 * N_STATES + 2))
# ---- Placeholders for one batch of transitions (s, a, r, s_) ----
tf_s = tf.placeholder(tf.float32, shape=[None, N_STATES])
tf_a = tf.placeholder(tf.int32, shape=[None])
tf_r = tf.placeholder(tf.float32, shape=[None])
tf_s_ = tf.placeholder(tf.float32, shape=[None, N_STATES])

# Evaluation network: fed with the current state; this is the net that is
# trained and queried when choosing actions.
with tf.variable_scope('q'):
    l_eval = tf.layers.dense(
        tf_s, 10, tf.nn.relu,
        kernel_initializer=tf.random_normal_initializer(0, 0.1),
    )
    q = tf.layers.dense(
        l_eval, N_ACTIONS,
        kernel_initializer=tf.random_normal_initializer(0, 0.1),
    )

# Target network: fed with the next state; created with trainable=False so
# the optimizer never updates it.
with tf.variable_scope('q_next'):
    l_target = tf.layers.dense(tf_s_, 10, tf.nn.relu, trainable=False)
    q_next = tf.layers.dense(l_target, N_ACTIONS, trainable=False)

# Regression target: r + GAMMA * max over actions of q_next; shape=(None, )
max_q_next = tf.reduce_max(q_next, axis=1)
q_target = tf_r + GAMMA * max_q_next

# Select, for every row of the batch, the q value of the action that was
# actually taken: indices are (row, action) pairs; shape=(None, )
batch_rows = tf.range(tf.shape(tf_a)[0], dtype=tf.int32)
a_indices = tf.stack([batch_rows, tf_a], axis=1)
q_wrt_a = tf.gather_nd(params=q, indices=a_indices)

# Mean squared difference between the target and the taken action's q value.
loss = tf.reduce_mean(tf.squared_difference(q_target, q_wrt_a))
train_op = tf.train.AdamOptimizer(LR).minimize(loss)

sess = tf.Session()
sess.run(tf.global_variables_initializer())
def choose_action(s):
    """Pick an action for observation `s` with an epsilon-greedy rule.

    With probability EPSILON the action with the highest q value from the
    evaluation network is returned; otherwise a uniformly random action in
    [0, N_ACTIONS) is returned.
    """
    # Add a leading batch axis so the observation matches the placeholder.
    batch = s[np.newaxis, :]

    if not np.random.uniform() < EPSILON:
        # Explore: ignore the network and act at random.
        return np.random.randint(0, N_ACTIONS)

    # Exploit: run the evaluation network and take the best-scoring action.
    q_values = sess.run(q, feed_dict={tf_s: batch})
    return np.argmax(q_values)
def store_transition(s, a, r, s_):
    """Write the transition (s, a, r, s_) into the replay memory.

    The memory is used as a ring buffer: once MEMORY_CAPACITY rows have been
    written, the oldest rows are overwritten. MEMORY_COUNTER is incremented
    on every call.
    """
    global MEMORY_COUNTER

    # Row layout: [s..., a, r, s_...]
    slot = MEMORY_COUNTER % MEMORY_CAPACITY
    MEMORY[slot, :] = np.hstack((s, [a, r], s_))
    MEMORY_COUNTER += 1
# Ops copying the evaluation-net weights into the target net. Built lazily on
# the first learn() call and then reused, so the graph stops growing.
_TARGET_REPLACE_OPS = None


def _get_target_replace_ops():
    """Return the cached list of ops that copy 'q' weights into 'q_next'."""
    global _TARGET_REPLACE_OPS
    if _TARGET_REPLACE_OPS is None:
        # get_collection filters variable names with re.match, i.e. by prefix:
        # the trailing '/' keeps 'q' from also matching 'q_next', and using
        # TRAINABLE_VARIABLES leaves out non-trainable optimizer slot
        # variables that may share the 'q/' prefix.
        e_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='q/')
        t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_next/')
        if len(e_params) != len(t_params):
            raise RuntimeError(
                'evaluation/target networks have different numbers of '
                'variables: %d vs %d' % (len(e_params), len(t_params)))
        _TARGET_REPLACE_OPS = [tf.assign(t, e) for t, e in zip(t_params, e_params)]
    return _TARGET_REPLACE_OPS


def learn():
    """Run one training step of the evaluation network on a sampled batch.

    Every TARGET_REPLACE_ITER calls (including the very first one) the target
    network is first overwritten with the evaluation network's weights. A
    batch of BATCH_SIZE transitions is then drawn, with replacement, from the
    filled part of the replay memory and fed to `train_op`.

    Does nothing if no transition has been stored yet.
    """
    global LEARNING_STEP_COUNTER

    # Only sample rows that have actually been written; the rest are zeros.
    n_stored = min(MEMORY_COUNTER, MEMORY_CAPACITY)
    if n_stored == 0:
        return

    # update target net
    if LEARNING_STEP_COUNTER % TARGET_REPLACE_ITER == 0:
        sess.run(_get_target_replace_ops())
    LEARNING_STEP_COUNTER += 1

    # learning: split the sampled rows back into (s, a, r, s_) columns
    sample_index = np.random.choice(n_stored, BATCH_SIZE)
    b_memory = MEMORY[sample_index, :]
    b_s = b_memory[:, :N_STATES]
    b_a = b_memory[:, N_STATES].astype(int)
    b_r = b_memory[:, N_STATES + 1]
    b_s_ = b_memory[:, -N_STATES:]
    sess.run(train_op, {tf_s: b_s, tf_a: b_a, tf_r: b_r, tf_s_: b_s_})
print('\nCollecting experience...')
for i_episode in range(400):
    s = env.reset()
    ep_r = 0    # sum of the shaped rewards collected in this episode

    while True:
        env.render()

        # Act, then observe the outcome of the action.
        a = choose_action(s)
        s_, r, done, _ = env.step(a)

        # Replace the environment's reward with a shaped one computed from the
        # next state: r1 grows as |x| shrinks, r2 grows as |theta| shrinks.
        x, _, theta, _ = s_
        r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
        r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
        r = r1 + r2

        store_transition(s, a, r, s_)
        ep_r += r

        # Training (and episode reporting) only starts once more than
        # MEMORY_CAPACITY transitions have been stored.
        if MEMORY_COUNTER > MEMORY_CAPACITY:
            learn()
            if done:
                print('Ep: ', i_episode, '| Ep_r: ', round(ep_r, 2))

        if done:
            break

        s = s_