diff --git a/en2023/code/PongNoFrameskip-v4_CategoricalDQN_tf.html b/en2023/code/PongNoFrameskip-v4_CategoricalDQN_tf.html
index e667857..728b8c6 100644
--- a/en2023/code/PongNoFrameskip-v4_CategoricalDQN_tf.html
+++ b/en2023/code/PongNoFrameskip-v4_CategoricalDQN_tf.html
@@ -14826,7 +14826,7 @@ Use Categorical DQN to Play Pong
     def build_net(self, action_n, atom_count):
         net = keras.Sequential([
-            keras.layers.Permute((2, 3, 1), input_shape=(4, 84, 84)),
+            layers.Permute((2, 3, 1), input_shape=(4, 84, 84)),
             layers.Conv2D(32, kernel_size=8, strides=4, activation=nn.relu),
             layers.Conv2D(64, kernel_size=4, strides=2, activation=nn.relu),
             layers.Conv2D(64, kernel_size=3, strides=1, activation=nn.relu),

diff --git a/en2023/code/PongNoFrameskip-v4_CategoricalDQN_tf.ipynb b/en2023/code/PongNoFrameskip-v4_CategoricalDQN_tf.ipynb
index bf60567..3cc31be 100644
--- a/en2023/code/PongNoFrameskip-v4_CategoricalDQN_tf.ipynb
+++ b/en2023/code/PongNoFrameskip-v4_CategoricalDQN_tf.ipynb
@@ -203,7 +203,7 @@
     "\n",
     "    def build_net(self, action_n, atom_count):\n",
     "        net = keras.Sequential([\n",
-    "            keras.layers.Permute((2, 3, 1), input_shape=(4, 84, 84)),\n",
+    "            layers.Permute((2, 3, 1), input_shape=(4, 84, 84)),\n",
     "            layers.Conv2D(32, kernel_size=8, strides=4, activation=nn.relu),\n",
     "            layers.Conv2D(64, kernel_size=4, strides=2, activation=nn.relu),\n",
     "            layers.Conv2D(64, kernel_size=3, strides=1, activation=nn.relu),\n",

diff --git a/en2023/code/PongNoFrameskip-v4_IQN_tf.html b/en2023/code/PongNoFrameskip-v4_IQN_tf.html
index f24c74c..3cbbcc2 100644
--- a/en2023/code/PongNoFrameskip-v4_IQN_tf.html
+++ b/en2023/code/PongNoFrameskip-v4_IQN_tf.html
@@ -14812,7 +14812,7 @@ Use Implicit Quantile Network
         super().__init__()
         self.cosine_count = cosine_count
         self.conv = keras.Sequential([
-            keras.layers.Permute((2, 3, 1), input_shape=(4, 84, 84)),
+            layers.Permute((2, 3, 1), input_shape=(4, 84, 84)),
             layers.Conv2D(32, kernel_size=8, strides=4, activation=nn.relu),
             layers.Conv2D(64, kernel_size=4, strides=2, activation=nn.relu),
             layers.Conv2D(64, kernel_size=3, strides=1, activation=nn.relu),

diff --git a/en2023/code/PongNoFrameskip-v4_IQN_tf.ipynb b/en2023/code/PongNoFrameskip-v4_IQN_tf.ipynb
index 0abd030..d0bb4f1 100644
--- a/en2023/code/PongNoFrameskip-v4_IQN_tf.ipynb
+++ b/en2023/code/PongNoFrameskip-v4_IQN_tf.ipynb
@@ -189,7 +189,7 @@
     "        super().__init__()\n",
     "        self.cosine_count = cosine_count\n",
     "        self.conv = keras.Sequential([\n",
-    "            keras.layers.Permute((2, 3, 1), input_shape=(4, 84, 84)),\n",
+    "            layers.Permute((2, 3, 1), input_shape=(4, 84, 84)),\n",
     "            layers.Conv2D(32, kernel_size=8, strides=4, activation=nn.relu),\n",
     "            layers.Conv2D(64, kernel_size=4, strides=2, activation=nn.relu),\n",
     "            layers.Conv2D(64, kernel_size=3, strides=1, activation=nn.relu),\n",

diff --git a/en2023/code/PongNoFrameskip-v4_QRDQN_tf.html b/en2023/code/PongNoFrameskip-v4_QRDQN_tf.html
index 06678c6..34919f7 100644
--- a/en2023/code/PongNoFrameskip-v4_QRDQN_tf.html
+++ b/en2023/code/PongNoFrameskip-v4_QRDQN_tf.html
@@ -14824,7 +14824,7 @@ Use QR-DQN to Play Pong
     def build_net(self, action_n, quantile_count):
         net = keras.Sequential([
-            keras.layers.Permute((2, 3, 1), input_shape=(4, 84, 84)),
+            layers.Permute((2, 3, 1), input_shape=(4, 84, 84)),
             layers.Conv2D(32, kernel_size=8, strides=4, activation=nn.relu),
             layers.Conv2D(64, kernel_size=4, strides=2, activation=nn.relu),
             layers.Conv2D(64, kernel_size=3, strides=1, activation=nn.relu),

diff --git a/en2023/code/PongNoFrameskip-v4_QRDQN_tf.ipynb b/en2023/code/PongNoFrameskip-v4_QRDQN_tf.ipynb
index cc7f332..074e4f6 100644
--- a/en2023/code/PongNoFrameskip-v4_QRDQN_tf.ipynb
+++ b/en2023/code/PongNoFrameskip-v4_QRDQN_tf.ipynb
@@ -201,7 +201,7 @@
     "\n",
     "    def build_net(self, action_n, quantile_count):\n",
     "        net = keras.Sequential([\n",
-    "            keras.layers.Permute((2, 3, 1), input_shape=(4, 84, 84)),\n",
+    "            layers.Permute((2, 3, 1), input_shape=(4, 84, 84)),\n",
     "            layers.Conv2D(32, kernel_size=8, strides=4, activation=nn.relu),\n",
     "            layers.Conv2D(64, kernel_size=4, strides=2, activation=nn.relu),\n",
     "            layers.Conv2D(64, kernel_size=3, strides=1, activation=nn.relu),\n",
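
All six hunks above make the same one-line change: `keras.layers.Permute` becomes `layers.Permute`. Below is a minimal runnable sketch of the changed network head; the imports shown are assumptions about the notebooks' context (they are consistent with the surrounding code, which calls `keras.Sequential`, `layers.Conv2D`, and `nn.relu`), not a quotation of the notebooks themselves.

```python
# Minimal sketch (not the notebooks' full agent code). Assumptions: TensorFlow 2.x
# with `keras`, `layers`, and `nn` imported as below, which makes `layers.Permute`
# and the former `keras.layers.Permute` refer to the same layer class.
import numpy as np
from tensorflow import keras, nn
from tensorflow.keras import layers

net = keras.Sequential([
    # (frames, height, width) -> (height, width, frames): channels-last for Conv2D
    layers.Permute((2, 3, 1), input_shape=(4, 84, 84)),
    layers.Conv2D(32, kernel_size=8, strides=4, activation=nn.relu),
    layers.Conv2D(64, kernel_size=4, strides=2, activation=nn.relu),
    layers.Conv2D(64, kernel_size=3, strides=1, activation=nn.relu),
    layers.Flatten(),
])

# A dummy batch of 4 stacked 84x84 frames flows through to shape (1, 3136).
print(net(np.zeros((1, 4, 84, 84), dtype=np.float32)).shape)
```

Under these imports, `layers` is `tf.keras.layers`, so the edit only shortens the reference and leaves the layer stack unchanged.
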
diff --git a/en2023/notation.html b/en2023/notation.html
index cc52304..9f19dc1 100644
--- a/en2023/notation.html
+++ b/en2023/notation.html
@@ -741,6 +741,6 @@ Notations
-

Notation

General rules

  • Upper-case letters denote random events or random variables, while lower-case letters denote deterministic events or deterministic variables.
  • The serif typeface, such as X, denotes numerical values. The sans-serif typeface, such as X, denotes events in general, which may or may not be numerical.
  • Bold letters denote vectors (such as w) or matrices (such as F); matrices are always upper-case, even when they are deterministic.
  • Calligraphic letters, such as X, denote sets.
  • Fraktur letters, such as 𝔣, denote mappings.

Table

The following notations are used throughout the book. Occasionally, notations defined locally in a section take precedence.

| English Letters | Description |
| --- | --- |
| A, a | advantage |
| A, a | action |
| A | action space |
| B, b | baseline in policy gradient; numerical belief in partially observable tasks; (lower case only) bonus; behavior policy in off-policy learning |
| B, b | belief in partially observable tasks |
| 𝔅π, 𝔟π | Bellman expectation operator of policy π (upper case only used in distributional RL) |
| 𝔅*, 𝔟* | Bellman optimal operator (upper case only used in distributional RL) |
| B | a batch of transitions generated by experience replay; belief space in partially observable tasks |
| B+ | belief space with terminal belief in partially observable tasks |
| c | counting; coefficients in linear programming |
| d, d | metrics |
| df | f-divergence |
| dKL | KL divergence |
| dJS | JS divergence |
| dTV | total variation |
| Dt | indicator of episode end |
| D | set of experience |
| e | eligibility trace |
| E | expectation |
| 𝔣 | a mapping |
| F | Fisher information matrix |
| G, g | return |
| g | gradient vector |
| h | action preference |
| H | entropy |
| k | index of iteration |
| ℓ | loss |
| p | probability, dynamics |
| P | transition matrix |
| o | observation probability in partially observable tasks; infinitesimal in asymptotic notations |
| O, Õ | infinity in asymptotic notations |
| O, o | observation |
| Pr | probability |
| Q, q | action value |
| Qπ, qπ | action value of policy π (upper case only used in distributional RL) |
| Q*, q* | optimal action values (upper case only used in distributional RL) |
| q | vector representation of action values |
| R, r | reward |
| R | reward space |
| S, s | state |
| S | state space |
| S+ | state space with terminal state |
| T | steps in an episode |
| 𝔲 | belief update operator in partially observable tasks |
| U, u | TD target; (lower case only) upper bound |
| V, v | state value |
| Vπ, vπ | state value of policy π (upper case only used in distributional RL) |
| V*, v* | optimal state values (upper case only used in distributional RL) |
| v | vector representation of state values |
| Var | variance |
| w | parameters of value function estimate |
| X, x | an event |
| X | event space |
| z | parameters for eligibility trace |

| Greek Letters | Description |
| --- | --- |
| α | learning rate |
| β | reinforcement strength in eligibility trace; distortion function in distributional RL |
| γ | discount factor |
| Δ, δ | TD error |
| ε | parameters for exploration |
| λ | decay strength of eligibility trace |
| Π, π | policy |
| π* | optimal policy |
| πE | expert policy in imitation learning |
| θ | parameters for policy function estimates |
| ϑ | threshold for value iteration |
| ρ | visitation frequency; importance sampling ratio in off-policy learning |
| ρ | vector representation of visitation frequency |
| τ, τ | sojourn time of an SMDP |
| T, 𝞽 | trajectory |
| Ω, ω | accumulated probability in distributional RL; (lower case only) conditional probability for partially observable tasks |
| Ψ | Generalized Advantage Estimate (GAE) |

| Other Notations | Description |
| --- | --- |
| =d | share the same distribution |
| =a.e. | equal almost everywhere |
| <, ≤, ≥, > | compare numbers; element-wise comparison |
| ≺, ⪯, ⪰, ≻ | partial order of policies |
| ≪ | absolutely continuous |
| ∅ | empty set |
| ∇ | gradient |
| ∼ | obey a distribution |
| \|·\| | absolute value of a real number; element-wise absolute values of a vector or a matrix; the number of elements in a set |
+

Notation

General rules

  • Upper-case letters denote random events or random variables, while lower-case letters denote deterministic events or deterministic variables.
  • The serif typeface, such as X, denotes numerical values. The sans-serif typeface, such as X, denotes events in general, which may or may not be numerical.
  • Bold letters denote vectors (such as w) or matrices (such as F); matrices are always upper-case, even when they are deterministic.
  • Calligraphic letters, such as X, denote sets.
  • Fraktur letters, such as 𝔣, denote mappings.

Table

The following notations are used throughout the book. Occasionally, notations defined locally in a section take precedence.

| English Letters | Description |
| --- | --- |
| A, a | advantage |
| A, a | action |
| A | action space |
| B, b | baseline in policy gradient; numerical belief in partially observable tasks; (lower case only) bonus; behavior policy in off-policy learning |
| B, b | belief in partially observable tasks |
| 𝔅π, 𝔟π | Bellman expectation operator of policy π (upper case only used in distributional RL) |
| 𝔅*, 𝔟* | Bellman optimal operator (upper case only used in distributional RL) |
| B | a batch of transitions generated by experience replay; belief space in partially observable tasks |
| B+ | belief space with terminal belief in partially observable tasks |
| c | counting; coefficients in linear programming |
| d, d | metrics |
| df | f-divergence |
| dKL | KL divergence |
| dJS | JS divergence |
| dTV | total variation |
| Dt | indicator of episode end |
| D | set of experience |
| e | eligibility trace |
| E | expectation |
| 𝔣 | a mapping |
| F | Fisher information matrix |
| G, g | return |
| g | gradient vector |
| h | action preference |
| H | entropy |
| k | index of iteration |
| ℓ | loss |
| p | probability, dynamics |
| P | transition matrix |
| o | observation probability in partially observable tasks; infinitesimal in asymptotic notations |
| O, Õ | infinity in asymptotic notations |
| O, o | observation |
| Pr | probability |
| Q, q | action value |
| Qπ, qπ | action value of policy π (upper case only used in distributional RL) |
| Q*, q* | optimal action values (upper case only used in distributional RL) |
| q | vector representation of action values |
| R, r | reward |
| R | reward space |
| S, s | state |
| S | state space |
| S+ | state space with terminal state |
| T | steps in an episode |
| T, T | trajectory |
| 𝔲 | belief update operator in partially observable tasks |
| U, u | TD target; (lower case only) upper bound |
| V, v | state value |
| Vπ, vπ | state value of policy π (upper case only used in distributional RL) |
| V*, v* | optimal state values (upper case only used in distributional RL) |
| v | vector representation of state values |
| Var | variance |
| w | parameters of value function estimate |
| X, x | an event |
| X | event space |
| z | parameters for eligibility trace |

| Greek Letters | Description |
| --- | --- |
| α | learning rate |
| β | reinforcement strength in eligibility trace; distortion function in distributional RL |
| γ | discount factor |
| Δ, δ | TD error |
| ε | parameters for exploration |
| λ | decay strength of eligibility trace |
| Π, π | policy |
| π* | optimal policy |
| πE | expert policy in imitation learning |
| θ | parameters for policy function estimates |
| ϑ | threshold for value iteration |
| ρ | visitation frequency; importance sampling ratio in off-policy learning |
| ρ | vector representation of visitation frequency |
| τ, τ | sojourn time of an SMDP |
| Ω, ω | accumulated probability in distributional RL; (lower case only) conditional probability for partially observable tasks |
| Ψ | Generalized Advantage Estimate (GAE) |

| Other Notations | Description |
| --- | --- |
| =d | share the same distribution |
| =a.e. | equal almost everywhere |
| <, ≤, ≥, > | compare numbers; element-wise comparison |
| ≺, ⪯, ⪰, ≻ | partial order of policies |
| ≪ | absolutely continuous |
| ∅ | empty set |
| ∇ | gradient |
| ∼ | obey a distribution |
| \|·\| | absolute value of a real number; element-wise absolute values of a vector or a matrix; the number of elements in a set |
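
To make the conventions and table entries concrete, here is one worked formula assembled from the symbols above. It is an illustration constructed for this note, not an equation quoted from the book.

```latex
% Constructed illustration of the notation: S_t and G_t are random quantities
% (upper case), s and a are deterministic values (lower case), \mathcal{A}(s)
% is a set (calligraphic), and \pi, v_\pi, q_\pi are the policy and the
% state/action values listed in the table.
\documentclass{article}
\usepackage{amsmath}
\begin{document}
\[
v_\pi\left(s\right)
  = \mathrm{E}_\pi\left[G_t \middle\vert S_t = s\right]
  = \sum_{a \in \mathcal{A}\left(s\right)} \pi\left(a \middle\vert s\right) q_\pi\left(s, a\right)
\]
\end{document}
```
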
\ No newline at end of file
diff --git a/en2023/notation_zh.html b/en2023/notation_zh.html
index 0e697f7..304407b 100644
--- a/en2023/notation_zh.html
+++ b/en2023/notation_zh.html
@@ -741,6 +741,6 @@ 《强化学习:原理与Python实现》数学记号
-

Mathematical notation for 《强化学习:原理与Python实现》

General rules

  • Upper-case letters denote random events or random variables; lower-case letters denote deterministic events or deterministic variables.
  • The serif typeface (e.g. Times New Roman, as in X) denotes numerical values; the sans-serif typeface (e.g. Open Sans, as in X) is not necessarily numerical.
  • Bold letters denote vectors (such as w) or matrices (such as F); matrices are always upper-case, even when they are deterministic.
  • Calligraphic letters (such as X) denote sets.
  • Fraktur letters (such as 𝔣) denote mappings.

Notation table

The table below lists the commonly used notations. Some sections define notations locally; those local definitions take precedence there.

| English Letters | Description |
| --- | --- |
| A, a | advantage |
| A, a | action |
| A | action space |
| B, b | baseline in policy gradient; numerical belief in partially observable tasks; (lower case only) bonus; behavior policy in off-policy learning |
| B, b | belief in partially observable tasks |
| 𝔅π, 𝔟π | Bellman expectation operator of policy π (upper case only used in distributional RL) |
| 𝔅*, 𝔟* | Bellman optimal operator (upper case only used in distributional RL) |
| B | a batch of transitions sampled in experience replay; belief space in partially observable tasks |
| B+ | belief space with terminal belief in partially observable tasks |
| c | counting; objective coefficients in linear programming |
| d, d | metrics |
| df | f-divergence |
| dKL | KL divergence |
| dJS | JS divergence |
| dTV | total variation |
| Dt | indicator of episode end |
| D | set of experience |
| e | eligibility trace |
| E | expectation |
| 𝔣 | a mapping |
| F | Fisher information matrix |
| G, g | return |
| g | gradient vector |
| h | action preference |
| H | entropy |
| k | index of iteration |
| ℓ | loss |
| p | probability; dynamics |
| P | transition matrix |
| o | observation probability in partially observable tasks; infinitesimal in asymptotic notations |
| O, Õ | infinity in asymptotic notations |
| O, o | observation |
| Pr | probability |
| Q, q | action value |
| Qπ, qπ | action value of policy π (upper case only used in distributional RL) |
| Q*, q* | optimal action value (upper case only used in distributional RL) |
| q | vector representation of action values |
| R, r | reward |
| R | reward space |
| S, s | state |
| S | state space |
| S+ | state space with terminal state |
| T | steps in an episode |
| 𝔲 | belief update operator in partially observable tasks |
| U, u | TD target (return estimate obtained by bootstrapping); (lower case only) upper confidence bound |
| V, v | state value |
| Vπ, vπ | state value of policy π (upper case only used in distributional RL) |
| V*, v* | optimal state value (upper case only used in distributional RL) |
| v | vector representation of state values |
| Var | variance |
| w | parameters of value function estimate |
| X, x | an event |
| X | event space |
| z | parameters for eligibility trace |

| Greek Letters | Description |
| --- | --- |
| α | learning rate |
| β | reinforcement strength in eligibility trace; distortion function in distributional RL |
| γ | discount factor |
| Δ, δ | TD error |
| ε | exploration parameters |
| λ | decay strength of eligibility trace |
| Π, π | policy |
| π* | optimal policy |
| πE | expert policy in imitation learning |
| θ | parameters for policy function estimates |
| ϑ | termination threshold for value iteration |
| ρ | visitation frequency; importance sampling ratio in off-policy learning |
| ρ | vector representation of visitation frequency |
| τ, τ | sojourn time in a semi-Markov decision process (SMDP) |
| T, 𝞽 | trajectory |
| Ω, ω | accumulated probability in distributional RL; (lower case only) conditional probability in partially observable tasks |
| Ψ | generalized advantage estimate (GAE) |

| Other Notations | Description |
| --- | --- |
| =d | share the same distribution |
| =a.e. | equal almost everywhere |
| <, ≤, ≥, > | comparison of numbers; element-wise comparison of vectors |
| ≺, ⪯, ⪰, ≻ | partial order of policies |
| ≪ | absolutely continuous |
| ∅ | empty set |
| ∇ | gradient |
| ∼ | follows a distribution |
| \|·\| | absolute value of a real number; element-wise absolute value of a vector or matrix; number of elements in a set |

 

+

Mathematical notation for 《强化学习:原理与Python实现》

General rules

  • Upper-case letters denote random events or random variables; lower-case letters denote deterministic events or deterministic variables.
  • The serif typeface (e.g. Times New Roman, as in X) denotes numerical values; the sans-serif typeface (e.g. Open Sans, as in X) is not necessarily numerical.
  • Bold letters denote vectors (such as w) or matrices (such as F); matrices are always upper-case, even when they are deterministic.
  • Calligraphic letters (such as X) denote sets.
  • Fraktur letters (such as 𝔣) denote mappings.

Notation table

The table below lists the commonly used notations. Some sections define notations locally; those local definitions take precedence there.

| English Letters | Description |
| --- | --- |
| A, a | advantage |
| A, a | action |
| A | action space |
| B, b | baseline in policy gradient; numerical belief in partially observable tasks; (lower case only) bonus; behavior policy in off-policy learning |
| B, b | belief in partially observable tasks |
| 𝔅π, 𝔟π | Bellman expectation operator of policy π (upper case only used in distributional RL) |
| 𝔅*, 𝔟* | Bellman optimal operator (upper case only used in distributional RL) |
| B | a batch of transitions sampled in experience replay; belief space in partially observable tasks |
| B+ | belief space with terminal belief in partially observable tasks |
| c | counting; objective coefficients in linear programming |
| d, d | metrics |
| df | f-divergence |
| dKL | KL divergence |
| dJS | JS divergence |
| dTV | total variation |
| Dt | indicator of episode end |
| D | set of experience |
| e | eligibility trace |
| E | expectation |
| 𝔣 | a mapping |
| F | Fisher information matrix |
| G, g | return |
| g | gradient vector |
| h | action preference |
| H | entropy |
| k | index of iteration |
| ℓ | loss |
| p | probability; dynamics |
| P | transition matrix |
| o | observation probability in partially observable tasks; infinitesimal in asymptotic notations |
| O, Õ | infinity in asymptotic notations |
| O, o | observation |
| Pr | probability |
| Q, q | action value |
| Qπ, qπ | action value of policy π (upper case only used in distributional RL) |
| Q*, q* | optimal action value (upper case only used in distributional RL) |
| q | vector representation of action values |
| R, r | reward |
| R | reward space |
| S, s | state |
| S | state space |
| S+ | state space with terminal state |
| T | steps in an episode |
| T, T | trajectory |
| 𝔲 | belief update operator in partially observable tasks |
| U, u | TD target (return estimate obtained by bootstrapping); (lower case only) upper confidence bound |
| V, v | state value |
| Vπ, vπ | state value of policy π (upper case only used in distributional RL) |
| V*, v* | optimal state value (upper case only used in distributional RL) |
| v | vector representation of state values |
| Var | variance |
| w | parameters of value function estimate |
| X, x | an event |
| X | event space |
| z | parameters for eligibility trace |

| Greek Letters | Description |
| --- | --- |
| α | learning rate |
| β | reinforcement strength in eligibility trace; distortion function in distributional RL |
| γ | discount factor |
| Δ, δ | TD error |
| ε | exploration parameters |
| λ | decay strength of eligibility trace |
| Π, π | policy |
| π* | optimal policy |
| πE | expert policy in imitation learning |
| θ | parameters for policy function estimates |
| ϑ | termination threshold for value iteration |
| ρ | visitation frequency; importance sampling ratio in off-policy learning |
| ρ | vector representation of visitation frequency |
| τ, τ | sojourn time in a semi-Markov decision process (SMDP) |
| Ω, ω | accumulated probability in distributional RL; (lower case only) conditional probability in partially observable tasks |
| Ψ | generalized advantage estimate (GAE) |

| Other Notations | Description |
| --- | --- |
| =d | share the same distribution |
| =a.e. | equal almost everywhere |
| <, ≤, ≥, > | comparison of numbers; element-wise comparison of vectors |
| ≺, ⪯, ⪰, ≻ | partial order of policies |
| ≪ | absolutely continuous |
| ∅ | empty set |
| ∇ | gradient |
| ∼ | follows a distribution |
| \|·\| | absolute value of a real number; element-wise absolute value of a vector or matrix; number of elements in a set |

 

\ No newline at end of file
diff --git a/zh2023/errata/202307.md b/zh2023/errata/202307.md
index 079dfc9..f651e05 100644
--- a/zh2023/errata/202307.md
+++ b/zh2023/errata/202307.md
@@ -32,22 +32,31 @@ $\sum\limits_{t=0}^{+\infty}$
 
 ## Page 42, lines 3-4
 
-$p_\ast\left(\mathsfit{s'},\mathsfit{a'}|\mathsfit{s},\mathsfit{a}\right)=\sum\limits_{\mathsfit{a'}}{\pi_\ast\left(\mathsfit{a'}\mid\mathsfit{s'} \right)\sum\limits_\mathsfit{a}{p\left(\mathsfit{s'}\mid\mathsfit{s},\mathsfit{a}\right)}}$,
+$p_\ast\left(\mathsfit{s'},\mathsfit{a'}\middle\vert\mathsfit{s},\mathsfit{a}\right)=\sum\limits_{\mathsfit{a'}}{\pi_\ast\left(\mathsfit{a'}\middle\vert\mathsfit{s'} \right)\sum\limits_\mathsfit{a}{p\left(\mathsfit{s'}\middle\vert\mathsfit{s},\mathsfit{a}\right)}}$,
 $\mathsfit{s}\in\mathcal{S},\mathsfit{a}\in\mathcal{A}\left(\mathsfit{s}\right),\mathsfit{s'}\in\mathcal{S},\mathsfit{a}\in\mathcal{A}\left(\mathsfit{s'}\right)$
 
 #### Change to
 
-$p_\ast\left({\mathsfit{s'},\mathsfit{a'}|\mathsfit{s},\mathsfit{a}}\right)=\pi_\ast\left(\mathsfit{a'}\mid\mathsfit{s'}\right)p\left( \mathsfit{s'}\mid\mathsfit{s},\mathsfit{a}\right),\quad\mathsfit{s}\in\mathcal{S},\mathsfit{a}\in\mathcal{A}\left(\mathsfit{s}\right),\mathsfit{s'}\in\mathcal{S},\mathsfit{a'}\in\mathcal{A}\left(\mathsfit{s'}\right)$
+$p_\ast\left({\mathsfit{s'},\mathsfit{a'}|\mathsfit{s},\mathsfit{a}}\right)=\pi_\ast\left(\mathsfit{a'}\middle\vert\mathsfit{s'}\right)p\left( \mathsfit{s'}\mid\mathsfit{s},\mathsfit{a}\right),\quad\mathsfit{s}\in\mathcal{S},\mathsfit{a}\in\mathcal{A}\left(\mathsfit{s}\right),\mathsfit{s'}\in\mathcal{S},\mathsfit{a'}\in\mathcal{A}\left(\mathsfit{s'}\right)$
+
+
+## Page 80, line 10 from the bottom
+
+$\alpha _k\mathrm{E}\left[\left|F{\left(X_ {k-1}\right)}^2\right|\middle\vert{X}_ {k-1}\right]$
+
+#### Change to
+
+$\alpha _k\mathrm{E}\left[\left|F\left(X_ {k-1}\right)\right|^2\middle\vert{X}_ {k-1}\right]$
 
 
 ## Page 117, the last full-width math expression
 
-$\rho_{t+1:t+n-1}=\frac{\Pr_\pi\left[R_{t+1},\mathsfit{S}_{t+1},\mathsfit{A}_{t+1},\ldots,\mathsfit{S}_{t+n}\mid\mathsfit{S}_t\right]}{\Pr_b\left[R_{t+1},\mathsfit{S}_{t+1},\mathsfit{A}_{t+1},\ldots,\mathsfit{S}_{t+n}\mid\mathsfit{S}_t\right]}=\prod\limits_{\tau=t+1}^{t+n-1}{\frac{\pi\left(\mathsfit{A}_\tau\mid\mathsfit{S}_\tau\right)}{b\left(\mathsfit{A}_\tau\mid\mathsfit{S}_\tau\right)}}$
+$\rho_{t+1:t+n-1}=\frac{\Pr_\pi\left[R_{t+1},\mathsfit{S}_{t+1},\mathsfit{A}_{t+1},\ldots,\mathsfit{S}_{t+n}\middle\vert\mathsfit{S}_t\right]}{\Pr_b\left[R_{t+1},\mathsfit{S}_{t+1},\mathsfit{A}_{t+1},\ldots,\mathsfit{S}_{t+n}\middle\vert\mathsfit{S}_t\right]}=\prod\limits_{\tau=t+1}^{t+n-1}{\frac{\pi\left(\mathsfit{A}_\tau\mid\mathsfit{S}_\tau\right)}{b\left(\mathsfit{A}_\tau\mid\mathsfit{S}_\tau\right)}}$
 
 #### Change to
 
-$\rho_{t+1:t+n-1}=\frac{\Pr_\pi\left[R_{t+1},\mathsfit{S}_{t+1},\mathsfit{A}_{t+1},\ldots,\mathsfit{S}_{t+n}\mid\mathsfit{S}_t,\mathsfit{A}_t\right]}{\Pr_b\left[R_{t+1},\mathsfit{S}_{t+1},\mathsfit{A}_{t+1},\ldots,\mathsfit{S}_{t+n}\mid\mathsfit{S}_t,\mathsfit{A}_t\right]}=\prod\limits_{\tau=t+1}^{t+n-1}{\frac{\pi\left(\mathsfit{A}_\tau\mid\mathsfit{S}_\tau\right)}{b\left(\mathsfit{A}_\tau\mid\mathsfit{S}_\tau\right)}}$
+$\rho_{t+1:t+n-1}=\frac{\Pr_\pi\left[R_{t+1},\mathsfit{S}_{t+1},\mathsfit{A}_{t+1},\ldots,\mathsfit{S}_{t+n}\mid\mathsfit{S}_t,\mathsfit{A}_t\right]}{\Pr_b\left[R_{t+1},\mathsfit{S}_{t+1},\mathsfit{A}_{t+1},\ldots,\mathsfit{S}_{t+n}\mid\mathsfit{S}_t,\mathsfit{A}_t\right]}=\prod\limits_{\tau=t+1}^{t+n-1}{\frac{\pi\left(\mathsfit{A}_\tau\middle\vert\mathsfit{S}_\tau\right)}{b\left(\mathsfit{A}_\tau\middle\vert\mathsfit{S}_\tau\right)}}$
 
 
 ## Page 177, the last line
 
@@ -59,6 +68,23 @@ $\gamma^2\mathrm{E}_{\pi\left(\boldsymbol\theta\right)}\left[\nabla{v_{\pi\left(
 $\gamma^2\mathrm{E}_{\pi\left(\boldsymbol\theta\right)}\left[\nabla{v_{\pi\left(\boldsymbol\theta\right)}}\left(\mathsfit{S}_2\right)\right]$
 
+## Page 279, line 0
+
+$\gamma\sum\limits_\mathsfit{s'}{p_{\pi\left(\boldsymbol\theta\right)}\left(\mathsfit{s'}\middle\vert\mathsfit{s}\right)\nabla v_{\pi\left(\boldsymbol\theta\right)}^\left(\mathrm{H}\right)\left(\mathsfit{s}\right)}$
+
+### Change to
+
+$\gamma\sum\limits_\mathsfit{s'}{p_{\pi\left(\boldsymbol\theta\right)}\left(\mathsfit{s'}\middle\vert\mathsfit{s}\right)\nabla v_{\pi\left(\boldsymbol\theta\right)}^\left(\mathrm{H}\right)\left(\mathsfit{s'}\right)}$
+
+
+## Page 279, lines 2-3 and line 6 (2 occurrences)
+
+$\mathrm{E}_ {\pi\left(\boldsymbol\theta\right)}\left[\sum\limits_ \mathsfit{a}q_{\pi\left(\boldsymbol\theta\right)}^\left(\mathrm{H}\right)\left(\mathsfit{S}_ t,\mathsfit{a}\right)\nabla\pi\left(\mathsfit{a}\middle\vert{\mathsfit{S}_ t};\boldsymbol\theta\right)\right]+\nabla\left(\alpha^\left(\mathrm{H}\right)\mathrm{H}\left[\pi\left(\cdot\middle\vert\mathsfit{S}_ t;\boldsymbol\theta\right)\right]\right)$
+
+### Change to
+
+$\mathrm{E}_ {\pi\left(\boldsymbol\theta\right)}\left[\sum\limits_ \mathsfit{a}q_{\pi\left(\boldsymbol\theta\right)}^\left(\mathrm{H}\right)\left(\mathsfit{S}_ t,\mathsfit{a}\right)\nabla\pi\left(\mathsfit{a}\middle\vert{\mathsfit{S}_ t};\boldsymbol\theta\right)+\nabla\left(\alpha^\left(\mathrm{H}\right)\mathrm{H}\left[\pi\left(\cdot\middle\vert\mathsfit{S}_ t;\boldsymbol\theta\right)\right]\right)\right]$
+
 
 ## Page 288, Code 10-2
 
 ```python
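
The recurring change in the errata hunks above swaps `\mid` for `\middle\vert` inside `\left[...\right]` groups, presumably so that the conditioning bar scales with the enclosing brackets. Below is a minimal standalone LaTeX comparison, constructed here for illustration and reusing the page-80 expression from the errata.

```latex
% Illustration only: \mid keeps its normal text-height size, while \middle\vert
% stretches to match the surrounding \left[ ... \right] delimiters.
\documentclass{article}
\usepackage{amsmath}
\begin{document}
\[
\mathrm{E}\left[\left|F\left(X_{k-1}\right)\right|^{2} \mid X_{k-1}\right]
\qquad\text{vs.}\qquad
\mathrm{E}\left[\left|F\left(X_{k-1}\right)\right|^{2} \middle\vert X_{k-1}\right]
\]
\end{document}
```
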