diff --git a/PolicyGradient/CliffWalk Actor Critic Solution.ipynb b/PolicyGradient/CliffWalk Actor Critic Solution.ipynb
index a12984da8..0e952a07c 100644
--- a/PolicyGradient/CliffWalk Actor Critic Solution.ipynb
+++ b/PolicyGradient/CliffWalk Actor Critic Solution.ipynb
@@ -139,16 +139,15 @@
    "source": [
     "def actor_critic(env, estimator_policy, estimator_value, num_episodes, discount_factor=1.0):\n",
     "    \"\"\"\n",
-    "    Q-Learning algorithm for fff-policy TD control using Function Approximation.\n",
-    "    Finds the optimal greedy policy while following an epsilon-greedy policy.\n",
+    "    Actor Critic Algorithm. Optimizes the policy \n",
+    "    function approximator using policy gradient.\n",
     "    \n",
     "    Args:\n",
     "        env: OpenAI environment.\n",
-    "        estimator: Action-Value function estimator\n",
-    "        num_episodes: Number of episodes to run for.\n",
-    "        discount_factor: Lambda time discount factor.\n",
-    "        epsilon: Chance the sample a random action. Float betwen 0 and 1.\n",
-    "        epsilon_decay: Each episode, epsilon is decayed by this factor\n",
+    "        estimator_policy: Policy Function to be optimized \n",
+    "        estimator_value: Value function approximator, used as a baseline\n",
+    "        num_episodes: Number of episodes to run for\n",
+    "        discount_factor: Time-discount factor\n",
     "    \n",
     "    Returns:\n",
     "        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.\n",
@@ -307,7 +306,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.5.1"
+   "version": "3.5.0"
   }
  },
  "nbformat": 4,
diff --git a/PolicyGradient/CliffWalk REINFORCE with Baseline Solution.ipynb b/PolicyGradient/CliffWalk REINFORCE with Baseline Solution.ipynb
index 637ea9758..4291d5551 100644
--- a/PolicyGradient/CliffWalk REINFORCE with Baseline Solution.ipynb
+++ b/PolicyGradient/CliffWalk REINFORCE with Baseline Solution.ipynb
@@ -139,7 +139,7 @@
    "source": [
     "def reinforce(env, estimator_policy, estimator_value, num_episodes, discount_factor=1.0):\n",
     "    \"\"\"\n",
-    "    REINFORCE (Monte Carlo Policy Gradient) Algorotihm. Optimizes the policy\n",
+    "    REINFORCE (Monte Carlo Policy Gradient) Algorithm. Optimizes the policy\n",
     "    function approximator using policy gradient.\n",
     "    \n",
     "    Args:\n",
@@ -316,7 +316,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.5.1"
+   "version": "3.5.0"
   }
  },
  "nbformat": 4,
diff --git a/PolicyGradient/Continuous MountainCar Actor Critic Solution.ipynb b/PolicyGradient/Continuous MountainCar Actor Critic Solution.ipynb
index 1bcd4e4db..a6ee821ce 100644
--- a/PolicyGradient/Continuous MountainCar Actor Critic Solution.ipynb
+++ b/PolicyGradient/Continuous MountainCar Actor Critic Solution.ipynb
@@ -232,16 +232,15 @@
    "source": [
     "def actor_critic(env, estimator_policy, estimator_value, num_episodes, discount_factor=1.0):\n",
     "    \"\"\"\n",
-    "    Q-Learning algorithm for fff-policy TD control using Function Approximation.\n",
-    "    Finds the optimal greedy policy while following an epsilon-greedy policy.\n",
+    "    Actor Critic Algorithm. Optimizes the policy \n",
+    "    function approximator using policy gradient.\n",
     "    \n",
     "    Args:\n",
     "        env: OpenAI environment.\n",
-    "        estimator: Action-Value function estimator\n",
-    "        num_episodes: Number of episodes to run for.\n",
-    "        discount_factor: Lambda time discount factor.\n",
-    "        epsilon: Chance the sample a random action. Float betwen 0 and 1.\n",
-    "        epsilon_decay: Each episode, epsilon is decayed by this factor\n",
+    "        estimator_policy: Policy Function to be optimized \n",
+    "        estimator_value: Value function approximator, used as a baseline\n",
+    "        num_episodes: Number of episodes to run for\n",
+    "        discount_factor: Time-discount factor\n",
     "    \n",
     "    Returns:\n",
     "        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.\n",
@@ -410,7 +409,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.5.1"
+   "version": "3.5.0"
   }
  },
  "nbformat": 4,
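
The hunks above change only the docstrings; the function bodies are not part of the patch. For orientation, a minimal sketch of the discrete-action loop the new actor_critic docstring describes is given below. It assumes (this is not shown in these hunks) that estimator_policy.predict(state) returns action probabilities, that both estimators expose an update() method, and that env follows the classic gym step() API returning (next_state, reward, done, info); all names are illustrative, not the notebooks' exact code.

import itertools

import numpy as np


def actor_critic_sketch(env, estimator_policy, estimator_value, num_episodes, discount_factor=1.0):
    # Illustrative sketch only, under the interface assumptions stated above.
    for i_episode in range(num_episodes):
        state = env.reset()
        for t in itertools.count():
            # Actor: sample an action from the current policy.
            action_probs = estimator_policy.predict(state)
            action = np.random.choice(len(action_probs), p=action_probs)
            next_state, reward, done, _ = env.step(action)

            # Critic: one-step TD target and TD error (advantage estimate).
            td_target = reward + discount_factor * estimator_value.predict(next_state)
            td_error = td_target - estimator_value.predict(state)

            # Move the value estimate toward the TD target, and the policy in the
            # direction of the policy gradient weighted by the TD error.
            estimator_value.update(state, td_target)
            estimator_policy.update(state, td_error, action)

            if done:
                break
            state = next_state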