lunar_landar.py
import gym
import numpy as np
from scipy.special import softmax

# Uses the classic Gym API (env.reset() -> obs, env.step() -> 4-tuple).
env = gym.make('LunarLander-v2')
alpha = 0.01
num_states = len(env.observation_space.high)  # 8 observation values
num_actions = env.action_space.n              # 4 discrete actions
theta = np.random.normal(0, 1, size=(num_actions, num_states))
# print(theta)

for i_episode in range(2000):
    hist = []
    vt = 0
    s = env.reset()  # reset for each new trial
    for t in range(10000):
        env.render()
        # Linear softmax policy: one row of weights per action.
        pi = softmax(np.matmul(theta, s))
        # print(pi)
        a = np.random.choice(np.arange(len(pi)), p=pi)
        s_prime, r, done, info = env.step(a)
        vt += r
        hist.append((s, a, r))
        if done:
            print("Episode finished after {} timesteps".format(t + 1))
            break
        else:
            s = s_prime
    # REINFORCE update: walk the episode backwards, accumulate the return G,
    # and move theta along grad log pi(a|s) scaled by G.
    G = 0.0
    for (s, a, r) in reversed(hist):
        G += r
        pi = softmax(np.matmul(theta, s))
        grad_log_pi = -np.outer(pi, s)  # gradient of log softmax for every action row
        grad_log_pi[a] += s             # extra term for the action actually taken
        theta += alpha * G * grad_log_pi
        # print(s, a, r)
# Goal is to map from states to actions with some feature vector and weights.
# Features can just be all of the state values; the actions are the discrete action set.
# If there are m actions and n state values, then each action has n weights,
# so theta is an m x n matrix.
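
# A minimal shape check for the policy above (a sketch, not part of training;
# the _demo names below are illustrative only). With LunarLander-v2's
# 8-value observation (n = 8) and 4 actions (m = 4), theta is a (4, 8)
# matrix and softmax(theta @ s) is a length-4 probability vector.
_theta_demo = np.zeros((num_actions, num_states))
_s_demo = np.zeros(num_states)
_pi_demo = softmax(np.matmul(_theta_demo, _s_demo))
assert _pi_demo.shape == (num_actions,)
assert np.isclose(_pi_demo.sum(), 1.0)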