pomdpagent.py
import random
import numpy as np
import math
from Interfaces import Agent
#from planner import RoutePlanner
from PomdpenvironmentConverter import POMDPEnvironmentConverter
import copy
class LearningAgent(Agent):
""" An agent that learns to drive in the Smartcab world.
This is the object you will be modifying. """
def __init__(self, env, environment_type, learning=False, epsilon=1.0, alpha=0.5, resolution = 4, ):
super(LearningAgent, self).__init__(env) # Set the agent in the evironment
#self.planner = RoutePlanner(self.env, self) # Create a route planner
self.valid_actions = self.env.valid_actions # The set of valid actions
environment_copy = environment_type.copy(env)
self.environmentmodel = POMDPEnvironmentConverter(environment_copy,resolution) #the converter needs a model of the environment to make a few calculations.
# Set parameters of the learning agent
self.learning = learning # Whether the agent is expected to learn
self.Q = dict() # Create a Q-table which will be a dictionary of tuples
self.epsilon = epsilon # Random exploration factor
self.alpha = alpha # Learning factor
self.T = 1
self.weight = 0.0005
self.color = 'green'
###########
## TO DO ##
###########
# Set any additional class parameters as needed
    def reset(self, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        #self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0

        # Alternative decay schedules, kept commented out for reference:
        #self.epsilon = math.cos(self.T*self.alpha)
        #self.epsilon = self.alpha**self.T
        #self.epsilon = self.epsilon - 0.05

        self.environmentmodel.reset()
        if testing:
            self.epsilon = 0.0
            self.alpha = 0.0
        else:
            # Adjusted sigmoid decay: epsilon starts near 1, reaches 0.5 after
            # 'adjustment' trials, and decays toward 0 as the trial count T grows.
            threshold = 0.0004
            smallnum = 0.00001
            adjustment = (math.log(threshold) / (math.log(smallnum) * self.weight))
            self.epsilon = 1 / (1 + smallnum ** (-self.weight * (self.T - adjustment)))
            # end sigmoid

        self.T = self.T + 1
        return None
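
    # A worked example (approximate values) of how the epsilon-decay schedule in
    # reset() behaves with the defaults used above, i.e. weight = 0.0005,
    # threshold = 0.0004, smallnum = 1e-5:
    #   adjustment        = ln(0.0004) / (ln(1e-5) * 0.0005) ~= 1359 trials
    #   epsilon(T = 1)    ~= 1 / (1 + 0.0004) ~= 0.9996   (almost pure exploration)
    #   epsilon(T = 1359) =  0.5                          (sigmoid midpoint)
    #   epsilon(T -> inf) -> 0                            (almost pure exploitation)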
    def build_state(self):
        """ The build_state function is called when the agent requests data from the
            environment. The next waypoint, the intersection inputs, and the deadline
            are all features available to the agent. """

        # Collect data about the environment
        #waypoint = self.planner.next_waypoint() # The next waypoint
        inputs = self.environmentmodel.getState()  # State estimate from the POMDP environment model
        #deadline = self.env.get_deadline(self)   # Remaining deadline
        #print inputs

        ###########
        ## TO DO ##
        ###########
        # Set 'state' as a tuple of relevant data for the agent
        state = inputs
        return state
    def get_maxQ(self, state):
        """ The get_maxQ function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state
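        # Assuming createQ(state) has already initialised this state, the loop
        # below is equivalent to the one-liner max(self.Q[state].values()).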
        value = self.Q[state][self.valid_actions[0]]
        for action in self.Q[state]:
            if value <= self.Q[state][action]:
                value = self.Q[state][action]
        return value
    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        # Then, for each action available, set the initial Q-value to 0.0
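        # For example, with the Smartcab action set [None, 'forward', 'left', 'right']
        # (cf. the commented-out DriveActions list in choose_action), a fresh entry
        # would be {None: 0.0, 'forward': 0.0, 'left': 0.0, 'right': 0.0}.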
        if self.learning:
            if state in self.Q:
                return
            self.Q[state] = dict()
            for action in self.valid_actions:
                self.Q[state][action] = 0.0
        return
    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        #self.next_waypoint = self.planner.next_waypoint()
        #DriveActions = ['left','right','forward',None]
        action = None

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        # Otherwise, choose an action with the highest Q-value for the current state
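        # In other words, this is an epsilon-greedy policy: with probability epsilon
        # a uniformly random action is taken; otherwise one of the actions whose
        # Q-value equals the maximum for this state is chosen uniformly at random.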
        randomnum = random.uniform(0, 1)
        if randomnum <= self.epsilon:
            # Earlier alternatives, kept commented out:
            #for paction in self.valid_actions:
            #    if self.Q[state][paction] is 0.0:
            #        action = paction
            #else:
            #    chosenaction = random.randint(0,3)
            #    action = self.valid_actions[chosenaction]
            action = random.choice(self.valid_actions)
        else:
            maxQ = self.get_maxQ(state)
            # Collect every action tied for the maximum Q-value and break ties at random
            highqactions = [validact for validact in self.valid_actions
                            if self.Q[state][validact] >= maxQ]
            action = random.choice(highqactions)
            #print action
        return action
    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        # Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
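        # The update applied below is therefore:
        #     Q(state, action) <- (1 - alpha) * Q(state, action) + alpha * reward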
        if not self.learning:
            return
        qvalue = self.Q[state][action] * (1 - self.alpha)
        reward = reward * self.alpha
        self.Q[state][action] = reward + qvalue
        return
    def update(self):
        """ The update function is called when a time step is completed in the
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()            # Get current state
        self.createQ(state)                   # Create 'state' in Q-table
        action = self.choose_action(state)    # Choose an action
        reward = self.env.act(action)         # Receive a reward
        self.learn(state, action, reward)     # Q-learn
        self.environmentmodel.Transition(action, self.env.sense())  # Sense the environment and update the model
        return
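

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original project code). It assumes a
# hypothetical `Environment` class exposing `valid_actions`, a class-level
# `copy(env)` method, `act(action)`, and `sense()`, matching the calls made by
# LearningAgent above; the real Smartcab simulator drives the agent through
# its own simulation loop instead.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    from environment import Environment   # hypothetical module / class name

    env = Environment()
    agent = LearningAgent(env, Environment, learning=True, epsilon=1.0, alpha=0.5)

    # Training trials: epsilon decays according to the sigmoid schedule in reset()
    for trial in range(100):
        agent.reset(testing=False)
        for step in range(50):            # fixed number of steps per trial, for illustration
            agent.update()

    # Evaluation trial: reset(testing=True) forces epsilon = alpha = 0 (greedy policy)
    agent.reset(testing=True)
    for step in range(50):
        agent.update()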