-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmdp.py
executable file
·84 lines (63 loc) · 2.44 KB
/
mdp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import random
class MDP:
    """Abstract base class for a Markov Decision Process.

    Subclasses provide the model (states, actions, transitions, rewards,
    terminal test, discount factor); this base class supplies model-based
    simulation helpers (`execute`, `execute_policy`) built on top of them.
    """

    def get_states(self):
        """Return all states of this MDP."""
        raise NotImplementedError

    def get_actions(self, state):
        """Return all actions with non-zero probability from this state."""
        raise NotImplementedError

    def get_transitions(self, state, action):
        """Return all non-zero probability transitions for this action
        from this state, as a list of (state, probability) pairs.
        """
        raise NotImplementedError

    def get_reward(self, state, action, next_state):
        """Return the reward for transitioning from state to
        next_state via action.
        """
        raise NotImplementedError

    def is_terminal(self, state):
        """Return true if and only if state is a terminal state of this MDP."""
        raise NotImplementedError

    def get_discount_factor(self):
        """Return the discount factor for this MDP."""
        raise NotImplementedError

    def get_initial_state(self):
        """Return the initial state of this MDP."""
        raise NotImplementedError

    def get_goal_states(self):
        """Return all goal states of this MDP."""
        raise NotImplementedError

    def execute(self, state, action):
        """Return a (new_state, reward) pair for executing action in state,
        sampled according to the underlying transition probabilities.

        This can be used for model-free learning methods, but requires a
        model (get_transitions / get_reward) to operate. Override for
        simulation-based learning.

        Raises:
            RuntimeError: if the transition probabilities exceed 1.0, or if
                no outcome interval contains the sampled value (i.e. the
                probabilities sum to less than 1.0).
        """
        rand = random.random()
        cumulative_probability = 0.0
        for (new_state, probability) in self.get_transitions(state, action):
            # Inverse-CDF sampling: each outcome owns the half-open interval
            # [cumulative, cumulative + probability), so outcomes are drawn
            # proportionally to their probability with no boundary overlap.
            if cumulative_probability <= rand < cumulative_probability + probability:
                return (new_state, self.get_reward(state, action, new_state))
            cumulative_probability += probability
            if cumulative_probability >= 1.0:
                # Reached only when the model's probabilities are invalid:
                # a proper distribution would have matched `rand` already.
                raise RuntimeError(
                    "Cumulative probability >= 1.0 for action "
                    + str(action)
                    + " from "
                    + str(state)
                )
        # Probabilities summed to less than 1.0 and `rand` fell in the gap.
        raise RuntimeError(
            "No outcome state in simulation for action "
            + str(action)
            + " from "
            + str(state)
        )

    def execute_policy(self, policy, episodes=100):
        """Execute a policy on this MDP for a number of episodes.

        Each episode starts from the initial state and follows the policy's
        selected actions until a terminal state is reached.
        """
        for _ in range(episodes):
            state = self.get_initial_state()
            while not self.is_terminal(state):
                action = policy.select_action(state)
                (next_state, reward) = self.execute(state, action)
                state = next_state