pomdpagent.py
import random
import numpy as np
import math
from Interfaces import Agent
#from planner import RoutePlanner
from PomdpenvironmentConverter import POMDPEnvironmentConverter
import copy
class LearningAgent(Agent):
""" An agent that learns to drive in the Smartcab world.
This is the object you will be modifying. """
def __init__(self, env, environment_type, learning=False, epsilon=1.0, alpha=0.5, resolution = 4, ):
super(LearningAgent, self).__init__(env) # Set the agent in the evironment
#self.planner = RoutePlanner(self.env, self) # Create a route planner
self.valid_actions = self.env.valid_actions # The set of valid actions
environment_copy = environment_type.copy(env)
self.environmentmodel = POMDPEnvironmentConverter(environment_copy,resolution) #the converter needs a model of the environment to make a few calculations.
# Set parameters of the learning agent
self.learning = learning # Whether the agent is expected to learn
self.Q = dict() # Create a Q-table which will be a dictionary of tuples
self.epsilon = epsilon # Random exploration factor
self.alpha = alpha # Learning factor
self.T = 1
self.weight = 0.0005
self.color = 'green'
###########
## TO DO ##
###########
# Set any additional class parameters as needed
    def reset(self, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        #self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0

        # Alternative decay schedules, kept commented out for reference:
        #self.epsilon = math.cos(self.T*self.alpha)
        #self.epsilon = self.alpha**self.T
        #self.epsilon = self.epsilon - 0.05

        self.environmentmodel.reset()
        if testing:
            self.epsilon = 0.0
            self.alpha = 0.0
        else:
            # Adjusted sigmoid decay: epsilon starts near 1, reaches 0.5 after
            # 'adjustment' trials, and decays toward 0 as the trial count T grows.
            threshold = 0.0004
            smallnum = 0.00001
            adjustment = (math.log(threshold) / (math.log(smallnum) * self.weight))
            self.epsilon = 1 / (1 + smallnum ** (-self.weight * (self.T - adjustment)))
            # end sigmoid

        self.T = self.T + 1
        return None
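
    # A worked example (approximate values) of how the epsilon-decay schedule in
    # reset() behaves with the defaults used above, i.e. weight = 0.0005,
    # threshold = 0.0004, smallnum = 1e-5:
    #   adjustment        = ln(0.0004) / (ln(1e-5) * 0.0005) ~= 1359 trials
    #   epsilon(T = 1)    ~= 1 / (1 + 0.0004) ~= 0.9996   (almost pure exploration)
    #   epsilon(T = 1359) =  0.5                          (sigmoid midpoint)
    #   epsilon(T -> inf) -> 0                            (almost pure exploitation)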
    def build_state(self):
        """ The build_state function is called when the agent requests data from the
            environment. The next waypoint, the intersection inputs, and the deadline
            are all features available to the agent. """

        # Collect data about the environment
        #waypoint = self.planner.next_waypoint() # The next waypoint
        inputs = self.environmentmodel.getState()  # State estimate from the POMDP environment model
        #deadline = self.env.get_deadline(self)   # Remaining deadline
        #print inputs

        ###########
        ## TO DO ##
        ###########
        # Set 'state' as a tuple of relevant data for the agent
        state = inputs
        return state
    def get_maxQ(self, state):
        """ The get_maxQ function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state
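        # Assuming createQ(state) has already initialised this state, the loop
        # below is equivalent to the one-liner max(self.Q[state].values()).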
        value = self.Q[state][self.valid_actions[0]]
        for action in self.Q[state]:
            if value <= self.Q[state][action]:
                value = self.Q[state][action]
        return value
    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        # Then, for each action available, set the initial Q-value to 0.0
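        # For example, with the Smartcab action set [None, 'forward', 'left', 'right']
        # (cf. the commented-out DriveActions list in choose_action), a fresh entry
        # would be {None: 0.0, 'forward': 0.0, 'left': 0.0, 'right': 0.0}.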
        if self.learning:
            if state in self.Q:
                return
            self.Q[state] = dict()
            for action in self.valid_actions:
                self.Q[state][action] = 0.0
        return
    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        #self.next_waypoint = self.planner.next_waypoint()
        #DriveActions = ['left','right','forward',None]
        action = None

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        # Otherwise, choose an action with the highest Q-value for the current state
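        # In other words, this is an epsilon-greedy policy: with probability epsilon
        # a uniformly random action is taken; otherwise one of the actions whose
        # Q-value equals the maximum for this state is chosen uniformly at random.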
        randomnum = random.uniform(0, 1)
        if randomnum <= self.epsilon:
            # Earlier alternatives, kept commented out:
            #for paction in self.valid_actions:
            #    if self.Q[state][paction] is 0.0:
            #        action = paction
            #else:
            #    chosenaction = random.randint(0,3)
            #    action = self.valid_actions[chosenaction]
            action = random.choice(self.valid_actions)
        else:
            maxQ = self.get_maxQ(state)
            # Collect every action tied for the maximum Q-value and break ties at random
            highqactions = [validact for validact in self.valid_actions
                            if self.Q[state][validact] >= maxQ]
            action = random.choice(highqactions)
            #print action
        return action
    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        # Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
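        # The update applied below is therefore:
        #     Q(state, action) <- (1 - alpha) * Q(state, action) + alpha * reward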
        if not self.learning:
            return
        qvalue = self.Q[state][action] * (1 - self.alpha)
        reward = reward * self.alpha
        self.Q[state][action] = reward + qvalue
        return
    def update(self):
        """ The update function is called when a time step is completed in the
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()            # Get current state
        self.createQ(state)                   # Create 'state' in Q-table
        action = self.choose_action(state)    # Choose an action
        reward = self.env.act(action)         # Receive a reward
        self.learn(state, action, reward)     # Q-learn
        self.environmentmodel.Transition(action, self.env.sense())  # Sense the environment and update the model
        return
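

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original project code). It assumes a
# hypothetical `Environment` class exposing `valid_actions`, a class-level
# `copy(env)` method, `act(action)`, and `sense()`, matching the calls made by
# LearningAgent above; the real Smartcab simulator drives the agent through
# its own simulation loop instead.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    from environment import Environment   # hypothetical module / class name

    env = Environment()
    agent = LearningAgent(env, Environment, learning=True, epsilon=1.0, alpha=0.5)

    # Training trials: epsilon decays according to the sigmoid schedule in reset()
    for trial in range(100):
        agent.reset(testing=False)
        for step in range(50):            # fixed number of steps per trial, for illustration
            agent.update()

    # Evaluation trial: reset(testing=True) forces epsilon = alpha = 0 (greedy policy)
    agent.reset(testing=True)
    for step in range(50):
        agent.update()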