""" Trains an agent with (stochastic) Policy Gradients on Pong. Uses OpenAI Gym. """
import numpy as np
import _pickle as pickle #old code: import cPickle as pickle
import gym
# hyperparameters
H = 200 # number of hidden layer neurons
batch_size = 10 # every how many episodes to do a param update?
learning_rate = 1e-4
gamma = 0.99 # discount factor for reward
decay_rate = 0.99 # decay factor for RMSProp leaky sum of grad^2
resume = False # resume from previous checkpoint?
render = True
# model initialization
D = 80 * 80 # input dimensionality: 80x80 grid
if resume:
model = pickle.load(open('save.p', 'rb'))
else:
model = {}
model['W1'] = np.random.randn(H,D) / np.sqrt(D) # "Xavier" initialization
#H is the number of hidden layer nurons, D is dimensionality of the grid
model['W2'] = np.random.randn(H) / np.sqrt(H)
#both grad_buffer and rmsprop_cache are initialized to 0s
#old code: grad_buffer = [ k:np.zeros_like(v) for k,v in model.iteritems()]
grad_buffer = { k : np.zeros_like(v) for k,v in model.items() } # update buffers that add up gradients over a batch
#old code: rmsprop_cache = [ k:np.zeros_like(v) for k,v in model.iteritems()]
rmsprop_cache = { k : np.zeros_like(v) for k,v in model.items() } # rmsprop memory
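
# Shape summary (for reference): W1 is (H, D) = (200, 6400), W2 is (H,) = (200,);
# grad_buffer and rmsprop_cache each hold one zero array of the same shape per parameter.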

def sigmoid(x):
  return 1.0 / (1.0 + np.exp(-x))  # sigmoid "squashing" function to interval [0,1]

def prepro(I):
  """ prepro 210x160x3 uint8 frame into 6400 (80x80) 1D float vector """
  # preprocessing step: I is the raw game frame; reduce it to just the paddles and ball
  I = I[35:195]       # crop to rows 35..194 (the playing field)
  I = I[::2, ::2, 0]  # downsample by a factor of 2 and keep one color channel -> 80x80 grid
  I[I == 144] = 0     # erase background (background type 1)
  I[I == 109] = 0     # erase background (background type 2)
  I[I != 0] = 1       # everything else (paddles, ball) just set to 1
  return I.astype(float).ravel()  # flatten the 2D grid into a 1D vector
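
# Shape check: a raw frame is (210, 160, 3) uint8; cropping rows 35..194 gives (160, 160, 3),
# the strided slice gives (80, 80), and ravel() returns a flat vector of length 6400 == D.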

def discount_rewards(r):
  """ take 1D float array of rewards and compute discounted reward """
  discounted_r = np.zeros_like(r)
  running_add = 0  # running sum of the discounted reward
  # old code: for t in reversed(xrange(0, r.size)):
  for t in reversed(range(0, r.size)):
    if r[t] != 0: running_add = 0  # reset the sum, since this was a game boundary (pong specific!)
    running_add = running_add * gamma + r[t]  # decay the future return by gamma, so nearby rewards count more than distant ones
    discounted_r[t] = running_add
  return discounted_r
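
# Worked example (sanity check, not called by the script): with gamma = 0.99,
# discount_rewards applied to [0, 0, 1, 0, 0, -1] gives
# [0.9801, 0.99, 1.0, -0.9801, -0.99, -1.0] -- the reset at each nonzero reward
# keeps one Pong point's return from leaking into the next point.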

def policy_forward(x):
  h = np.dot(model['W1'], x)
  h[h < 0] = 0  # ReLU nonlinearity: any value below 0 becomes 0, everything else stays the same
  logp = np.dot(model['W2'], h)
  p = sigmoid(logp)
  return p, h  # return probability of taking action 2, and hidden state
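
# Forward-pass shapes: x is (6400,), W1 is (200, 6400) so h is (200,), W2 is (200,) so logp
# is a scalar, and p = sigmoid(logp) is the probability of choosing action 2 (UP in Pong).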

def policy_backward(epx, eph, epdlogp):
  """ backward pass. (epx is the stacked inputs, eph is array of intermediate hidden states) """
  dW2 = np.dot(eph.T, epdlogp).ravel()
  dh = np.outer(epdlogp, model['W2'])  # derivative with respect to the hidden state
  dh[eph <= 0] = 0  # backprop through the ReLU
  dW1 = np.dot(dh.T, epx)
  return {'W1': dW1, 'W2': dW2}
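
# Backward-pass shapes (per episode of T steps): eph is (T, 200) and epdlogp is (T, 1),
# so dW2 = eph.T @ epdlogp is (200,) after ravel(); dh = outer(epdlogp, W2) is (T, 200),
# and dW1 = dh.T @ epx is (200, 6400), matching W1.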

# implementation details: training loop setup
env = gym.make("Pong-v0")
observation = env.reset()
prev_x = None  # used in computing the difference between frames, since a single frame is static
xs, hs, dlogps, drs = [], [], [], []  # observations, hidden states, action gradients, and rewards respectively
running_reward = None
reward_sum = 0
episode_number = 0
while True:
  if render: env.render()

  # preprocess the observation, set input to network to be difference image
  cur_x = prepro(observation)
  x = cur_x - prev_x if prev_x is not None else np.zeros(D)
  prev_x = cur_x  # x is the difference image; the current frame now becomes the previous one

  # forward the policy network and sample an action from the returned probability
  aprob, h = policy_forward(x)
  action = 2 if np.random.uniform() < aprob else 3  # roll the dice!

  # record various intermediates (needed later for backprop)
  xs.append(x)  # observation
  hs.append(h)  # hidden state
  y = 1 if action == 2 else 0  # a "fake label"
  dlogps.append(y - aprob)  # grad that encourages the action that was taken to be taken (see http://cs231n.github.io/neural-networks-2/#losses if confused)
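  # Why (y - aprob): the policy is a Bernoulli over {action 2, action 3} with p = sigmoid(logp),
  # and d(log pi(y|x)) / d(logp) = y - p, so stacking (y - aprob) gives the gradient of the
  # log-probability of each sampled action with respect to the network's output logit.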

  # step the environment and get new measurements
  observation, reward, done, info = env.step(action)
  reward_sum += reward
  drs.append(reward)  # record reward (has to be done after we call step() to get reward for previous action)

  if done:  # an episode finished
    episode_number += 1

    # stack together all inputs, hidden states, action gradients, and rewards for this episode
    epx = np.vstack(xs)
    eph = np.vstack(hs)
    epdlogp = np.vstack(dlogps)
    epr = np.vstack(drs)
    xs, hs, dlogps, drs = [], [], [], []  # reset array memory

    # compute the discounted reward backwards through time
    discounted_epr = discount_rewards(epr)
    # standardize the rewards to be unit normal (helps control the gradient estimator variance)
    discounted_epr -= np.mean(discounted_epr)  # (r - mean) / std
    discounted_epr /= np.std(discounted_epr)

    epdlogp *= discounted_epr  # modulate the gradient with advantage (PG magic happens right here.)
    grad = policy_backward(epx, eph, epdlogp)
    for k in model: grad_buffer[k] += grad[k]  # accumulate grad over batch

    # perform rmsprop parameter update every batch_size episodes
    if episode_number % batch_size == 0:
      # old code: for k,v in model.iteritems():
      for k, v in model.items():
        g = grad_buffer[k]  # gradient
        rmsprop_cache[k] = decay_rate * rmsprop_cache[k] + (1 - decay_rate) * g**2  # RMSProp moving average of squared gradients
        model[k] += learning_rate * g / (np.sqrt(rmsprop_cache[k]) + 1e-5)
        grad_buffer[k] = np.zeros_like(v)  # reset batch gradient buffer
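      # Note: the update above uses += (gradient ascent) because the objective is the expected
      # reward we want to maximize; with a loss to minimize this would be -= instead.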

    # boring book-keeping
    running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
    # old code: print 'resetting env. episode reward total was %f. running mean: %f' % (reward_sum, running_reward)
    print('resetting env. episode reward total was %f. running mean: %f' % (reward_sum, running_reward))
    if episode_number % 100 == 0: pickle.dump(model, open('save.p', 'wb'))
    reward_sum = 0
    observation = env.reset()  # reset env
    prev_x = None

  if reward != 0:  # Pong gives either +1 or -1 reward exactly when a point ends.
    # old code: print 'ep %d: game finished, reward: %f' % (episode_number, reward) + ('' if reward == -1 else ' !!!!!!!!')
    print('ep %d: game finished, reward: %f' % (episode_number, reward) + ('' if reward == -1 else ' !!!!!!!!'))