diff --git a/README.md b/README.md index cc66866..b802296 100644 --- a/README.md +++ b/README.md @@ -39,8 +39,8 @@ All codes have been saved as a .ipynb file and a .html file in the same director | 10 | [LunarLanderContinuous-v2](https://zhiqingxiao.github.io/rl-book/en2023/code/LunarLanderContinuous-v2_ClosedForm.html) | SACwA [tf](https://zhiqingxiao.github.io/rl-book/en2023/code/LunarLanderContinuous-v2_SACwA_tf.html) [torch](https://zhiqingxiao.github.io/rl-book/en2023/code/LunarLanderContinuous-v2_SACwA_torch.html) | | 11 | [BipedalWalker-v3](https://zhiqingxiao.github.io/rl-book/en2023/code/BipedalWalker-v3_ClosedForm.html) | [ES](https://zhiqingxiao.github.io/rl-book/en2023/code/BipedalWalker-v3_ES.html), [ARS](https://zhiqingxiao.github.io/rl-book/en2023/code/BipedalWalker-v3_ARS.html) | | 12 | [PongNoFrameskip-v4](https://zhiqingxiao.github.io/rl-book/en2023/code/PongNoFrameskip-v4_ClosedForm.html) | CategoricalDQN [tf](https://zhiqingxiao.github.io/rl-book/en2023/code/PongNoFrameskip-v4_CategoricalDQN_tf.html) [torch](https://zhiqingxiao.github.io/rl-book/en2023/code/PongNoFrameskip-v4_CategoricalDQN_torch.html), QR-DQN [tf](https://zhiqingxiao.github.io/rl-book/en2023/code/PongNoFrameskip-v4_QRDQN_tf.html) [torch](https://zhiqingxiao.github.io/rl-book/en2023/code/PongNoFrameskip-v4_QRDQN_torch.html), IQN [tf](https://zhiqingxiao.github.io/rl-book/en2023/code/PongNoFrameskip-v4_IQN_tf.html) [torch](https://zhiqingxiao.github.io/rl-book/en2023/code/PongNoFrameskip-v4_IQN_torch.html) | -| 13 | [BernoulliMAB-v0](https://zhiqingxiao.github.io/rl-book/en2023/code/BernoulliMABEnv_demo.html) | [UCB](https://zhiqingxiao.github.io/rl-book/en2023/code/BernoulliMABEnv_demo.html) | -| 13 | [GaussianMAB-v0](https://zhiqingxiao.github.io/rl-book/en2023/code/BernoulliMABEnv_demo.html) | [UCB](https://zhiqingxiao.github.io/rl-book/en2023/code/GaussianMABEnv_demo.html) | +| 13 | [BernoulliMAB-v0](https://zhiqingxiao.github.io/rl-book/en2023/code/BernoulliMABEnv-v0_demo.html) | [UCB](https://zhiqingxiao.github.io/rl-book/en2023/code/BernoulliMABEnv-v0_demo.html) | +| 13 | [GaussianMAB-v0](https://zhiqingxiao.github.io/rl-book/en2023/code/GaussianMABEnv_demo.html) | [UCB](https://zhiqingxiao.github.io/rl-book/en2023/code/GaussianMABEnv_demo.html) | | 14 | [TicTacToe-v0](https://zhiqingxiao.github.io/rl-book/en2023/code/TicTacToe-v0_ExhaustiveSearch.html) | AlphaZero [tf](https://zhiqingxiao.github.io/rl-book/en2023/code/TicTacToe-v0_AlphaZero_tf.html) [torch](https://zhiqingxiao.github.io/rl-book/en2023/code/TicTacToe-v0_AlphaZero_torch.html) | | 15 note | [HumanoidBulletEnv-v0](https://zhiqingxiao.github.io/rl-book/en2023/code/HumanoidBulletEnv-v0_ClosedForm_demo.html) | BehaviorClone [tf](https://zhiqingxiao.github.io/rl-book/en2023/code/HumanoidBulletEnv-v0_BC_tf.html) [torch](https://zhiqingxiao.github.io/rl-book/en2023/code/HumanoidBulletEnv-v0_BC_torch.html), GAIL [tf](https://zhiqingxiao.github.io/rl-book/en2023/code/HumanoidBulletEnv-v0_GAILPPO_tf.html) [torch](https://zhiqingxiao.github.io/rl-book/en2023/code/HumanoidBulletEnv-v0_GAILPPO_torch.html) | | 16 | [Tiger-v0](https://zhiqingxiao.github.io/rl-book/en2023/code/Tiger-v0_ClosedForm.html) | [VI](https://zhiqingxiao.github.io/rl-book/en2023/code/Tiger-v0_Plan_demo.html) diff --git a/en2023/README.md b/en2023/README.md index 7f7f3e2..15567ad 100644 --- a/en2023/README.md +++ b/en2023/README.md @@ -66,8 +66,8 @@ List view: [link](https://github.com/zhiqingxiao/rl-book/blob/master/en2023/code | 10 | [LunarLanderContinuous-v2](https://zhiqingxiao.github.io/rl-book/en2023/code/LunarLanderContinuous-v2_ClosedForm.html) | SACwA [tf](https://zhiqingxiao.github.io/rl-book/en2023/code/LunarLanderContinuous-v2_SACwA_tf.html) [torch](https://zhiqingxiao.github.io/rl-book/en2023/code/LunarLanderContinuous-v2_SACwA_torch.html) | | 11 | [BipedalWalker-v3](https://zhiqingxiao.github.io/rl-book/en2023/code/BipedalWalker-v3_ClosedForm.html) | [ES](https://zhiqingxiao.github.io/rl-book/en2023/code/BipedalWalker-v3_ES.html), [ARS](https://zhiqingxiao.github.io/rl-book/en2023/code/BipedalWalker-v3_ARS.html) | | 12 | [PongNoFrameskip-v4](https://zhiqingxiao.github.io/rl-book/en2023/code/PongNoFrameskip-v4_ClosedForm.html) | CategoricalDQN [tf](https://zhiqingxiao.github.io/rl-book/en2023/code/PongNoFrameskip-v4_CategoricalDQN_tf.html) [torch](https://zhiqingxiao.github.io/rl-book/en2023/code/PongNoFrameskip-v4_CategoricalDQN_torch.html), QR-DQN [tf](https://zhiqingxiao.github.io/rl-book/en2023/code/PongNoFrameskip-v4_QRDQN_tf.html) [torch](https://zhiqingxiao.github.io/rl-book/en2023/code/PongNoFrameskip-v4_QRDQN_torch.html), IQN [tf](https://zhiqingxiao.github.io/rl-book/en2023/code/PongNoFrameskip-v4_IQN_tf.html) [torch](https://zhiqingxiao.github.io/rl-book/en2023/code/PongNoFrameskip-v4_IQN_torch.html) | -| 13 | [BernoulliMAB-v0](https://zhiqingxiao.github.io/rl-book/en2023/code/BernoulliMABEnv_demo.html) | [UCB](https://zhiqingxiao.github.io/rl-book/en2023/code/BernoulliMABEnv_demo.html) | -| 13 | [GaussianMAB-v0](https://zhiqingxiao.github.io/rl-book/en2023/code/BernoulliMABEnv_demo.html) | [UCB](https://zhiqingxiao.github.io/rl-book/en2023/code/GaussianMABEnv_demo.html) | +| 13 | [BernoulliMAB-v0](https://zhiqingxiao.github.io/rl-book/en2023/code/BernoulliMABEnv-v0_demo.html) | [UCB](https://zhiqingxiao.github.io/rl-book/en2023/code/BernoulliMABEnv-v0_demo.html) | +| 13 | [GaussianMAB-v0](https://zhiqingxiao.github.io/rl-book/en2023/code/GaussianMABEnv_demo.html) | [UCB](https://zhiqingxiao.github.io/rl-book/en2023/code/GaussianMABEnv_demo.html) | | 14 | [TicTacToe-v0](https://zhiqingxiao.github.io/rl-book/en2023/code/TicTacToe-v0_ExhaustiveSearch.html) | AlphaZero [tf](https://zhiqingxiao.github.io/rl-book/en2023/code/TicTacToe-v0_AlphaZero_tf.html) [torch](https://zhiqingxiao.github.io/rl-book/en2023/code/TicTacToe-v0_AlphaZero_torch.html) | | 15 note | [HumanoidBulletEnv-v0](https://zhiqingxiao.github.io/rl-book/en2023/code/HumanoidBulletEnv-v0_ClosedForm_demo.html) | BehaviorClone [tf](https://zhiqingxiao.github.io/rl-book/en2023/code/HumanoidBulletEnv-v0_BC_tf.html) [torch](https://zhiqingxiao.github.io/rl-book/en2023/code/HumanoidBulletEnv-v0_BC_torch.html), GAIL [tf](https://zhiqingxiao.github.io/rl-book/en2023/code/HumanoidBulletEnv-v0_GAILPPO_tf.html) [torch](https://zhiqingxiao.github.io/rl-book/en2023/code/HumanoidBulletEnv-v0_GAILPPO_torch.html) | | 16 | [Tiger-v0](https://zhiqingxiao.github.io/rl-book/en2023/code/Tiger-v0_ClosedForm.html) | [VI](https://zhiqingxiao.github.io/rl-book/en2023/code/Tiger-v0_Plan_demo.html) diff --git a/en2023/code/BernoulliMABEnv_demo.html b/en2023/code/BernoulliMABEnv-v0_demo.html similarity index 100% rename from en2023/code/BernoulliMABEnv_demo.html rename to en2023/code/BernoulliMABEnv-v0_demo.html diff --git a/en2023/code/BernoulliMABEnv_demo.ipynb b/en2023/code/BernoulliMABEnv-v0_demo.ipynb similarity index 100% rename from en2023/code/BernoulliMABEnv_demo.ipynb rename to en2023/code/BernoulliMABEnv-v0_demo.ipynb diff --git a/zh2023/README.md b/zh2023/README.md index 1972940..7a4ca21 100644 --- a/zh2023/README.md +++ b/zh2023/README.md @@ -32,8 +32,8 @@ | 10 | [LunarLanderContinuous-v2](https://zhiqingxiao.github.io/rl-book/en2023/code/LunarLanderContinuous-v2_ClosedForm.html) | SACwA [tf](https://zhiqingxiao.github.io/rl-book/en2023/code/LunarLanderContinuous-v2_SACwA_tf.html) [torch](https://zhiqingxiao.github.io/rl-book/en2023/code/LunarLanderContinuous-v2_SACwA_torch.html) | | 11 | [BipedalWalker-v3](https://zhiqingxiao.github.io/rl-book/en2023/code/BipedalWalker-v3_ClosedForm.html) | [ES](https://zhiqingxiao.github.io/rl-book/en2023/code/BipedalWalker-v3_ES.html), [ARS](https://zhiqingxiao.github.io/rl-book/en2023/code/BipedalWalker-v3_ARS.html) | | 12 | [PongNoFrameskip-v4](https://zhiqingxiao.github.io/rl-book/en2023/code/PongNoFrameskip-v4_ClosedForm.html) | CategoricalDQN [tf](https://zhiqingxiao.github.io/rl-book/en2023/code/PongNoFrameskip-v4_CategoricalDQN_tf.html) [torch](https://zhiqingxiao.github.io/rl-book/en2023/code/PongNoFrameskip-v4_CategoricalDQN_torch.html), QR-DQN [tf](https://zhiqingxiao.github.io/rl-book/en2023/code/PongNoFrameskip-v4_QRDQN_tf.html) [torch](https://zhiqingxiao.github.io/rl-book/en2023/code/PongNoFrameskip-v4_QRDQN_torch.html), IQN [tf](https://zhiqingxiao.github.io/rl-book/en2023/code/PongNoFrameskip-v4_IQN_tf.html) [torch](https://zhiqingxiao.github.io/rl-book/en2023/code/PongNoFrameskip-v4_IQN_torch.html) | -| 13 | [BernoulliMAB-v0](https://zhiqingxiao.github.io/rl-book/en2023/code/BernoulliMABEnv_demo.html) | [UCB](https://zhiqingxiao.github.io/rl-book/en2023/code/BernoulliMABEnv_demo.html) | -| 13 | [GaussianMAB-v0](https://zhiqingxiao.github.io/rl-book/en2023/code/BernoulliMABEnv_demo.html) | [UCB](https://zhiqingxiao.github.io/rl-book/en2023/code/GaussianMABEnv_demo.html) | +| 13 | [BernoulliMAB-v0](https://zhiqingxiao.github.io/rl-book/en2023/code/BernoulliMABEnv-v0_demo.html) | [UCB](https://zhiqingxiao.github.io/rl-book/en2023/code/BernoulliMABEnv-v0_demo.html) | +| 13 | [GaussianMAB-v0](https://zhiqingxiao.github.io/rl-book/en2023/code/GaussianMABEnv_demo.html) | [UCB](https://zhiqingxiao.github.io/rl-book/en2023/code/GaussianMABEnv_demo.html) | | 14 | [TicTacToe-v0](https://zhiqingxiao.github.io/rl-book/en2023/code/TicTacToe-v0_ExhaustiveSearch.html) | AlphaZero [tf](https://zhiqingxiao.github.io/rl-book/en2023/code/TicTacToe-v0_AlphaZero_tf.html) [torch](https://zhiqingxiao.github.io/rl-book/en2023/code/TicTacToe-v0_AlphaZero_torch.html) | | 15 注 | [HumanoidBulletEnv-v0](https://zhiqingxiao.github.io/rl-book/en2023/code/HumanoidBulletEnv-v0_ClosedForm_demo.html) | BehaviorClone [tf](https://zhiqingxiao.github.io/rl-book/en2023/code/HumanoidBulletEnv-v0_BC_tf.html) [torch](https://zhiqingxiao.github.io/rl-book/en2023/code/HumanoidBulletEnv-v0_BC_torch.html), GAIL [tf](https://zhiqingxiao.github.io/rl-book/en2023/code/HumanoidBulletEnv-v0_GAILPPO_tf.html) [torch](https://zhiqingxiao.github.io/rl-book/en2023/code/HumanoidBulletEnv-v0_GAILPPO_torch.html) | | 16 | [Tiger-v0](https://zhiqingxiao.github.io/rl-book/en2023/code/Tiger-v0_ClosedForm.html) | [VI](https://zhiqingxiao.github.io/rl-book/en2023/code/Tiger-v0_Plan_demo.html) diff --git a/zh2023/errata/202307.md b/zh2023/errata/202307.md index 737c024..079dfc9 100644 --- a/zh2023/errata/202307.md +++ b/zh2023/errata/202307.md @@ -50,7 +50,27 @@ $\rho_{t+1:t+n-1}=\frac{\Pr_\pi\left[R_{t+1},\mathsfit{S}_{t+1},\mathsfit{A}_{t+ $\rho_{t+1:t+n-1}=\frac{\Pr_\pi\left[R_{t+1},\mathsfit{S}_{t+1},\mathsfit{A}_{t+1},\ldots,\mathsfit{S}_{t+n}\mid\mathsfit{S}_t,\mathsfit{A}_t\right]}{\Pr_b\left[R_{t+1},\mathsfit{S}_{t+1},\mathsfit{A}_{t+1},\ldots,\mathsfit{S}_{t+n}\mid\mathsfit{S}_t,\mathsfit{A}_t\right]}=\prod\limits_{\tau=t+1}^{t+n-1}{\frac{\pi\left(\mathsfit{A}_\tau\mid\mathsfit{S}_\tau\right)}{b\left(\mathsfit{A}_\tau\mid\mathsfit{S}_\tau\right)}}$ -## 第288页代码10-2里的`step()`函数 +## 第177页最后一行 + +$\gamma^2\mathrm{E}_{\pi\left(\boldsymbol\theta\right)}\left[\nabla{v_{\pi\left(\boldsymbol\theta\right)}}\left(\mathsfit{S}_1\right)\right]$ + +#### 改为 + +$\gamma^2\mathrm{E}_{\pi\left(\boldsymbol\theta\right)}\left[\nabla{v_{\pi\left(\boldsymbol\theta\right)}}\left(\mathsfit{S}_2\right)\right]$ + + +## 第288页代码10-2 + +```python + def step(self, observation, reward, terminated): + position, velocity = observation + if position > -4 * velocity or position < 13 * velocity - 0.6: + force = 1. + else: + force = -1. + action = np.array([force,]) + return action +``` #### 改为 @@ -59,8 +79,7 @@ $\rho_{t+1:t+n-1}=\frac{\Pr_\pi\left[R_{t+1},\mathsfit{S}_{t+1},\mathsfit{A}_{t+ x, y, v_x, v_y, angle, v_angle, contact_left, \ contact_right = observation - if contact_left or contact_right: - # legs have contact + if contact_left or contact_right: # 腿接触了 f_y = -10. * v_y - 1. f_angle = 0. else: