Commit: mrp
Marek Petrik committed Apr 2, 2024
1 parent 2fc119e commit 0835ecb
Showing 4 changed files with 48 additions and 18 deletions.
26 changes: 25 additions & 1 deletion docs/src/recipes.md
@@ -37,4 +37,28 @@ CSV.write(fileout, output)
# output
1
```
```

## Making a small MDP

```jldoctest
using MDPs
ε = 0.01
# transition probabilities
P1 = [1 0 0; 0 1 0; 0 0 1]
P2 = [0 1 0; 1-ε 0 ε; 0 0 1]
Ps = [P1, P2]
# rewards
R = [10 -4 0; -1 -3 0; 0 0 0] # use the same reward for both actions
Rs = [R, R]
M = make_int_mdp(Ps, Rs)
state_count(M)
# output
3
```
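A possible follow-up, not part of this commit's recipe: once `M` is built, the `mrp` function touched below in `src/algorithms/mrp.jl` can turn it into a Markov reward process for a fixed policy. This is only a sketch; it assumes `mrp` is reachable from `MDPs` (qualify it as `MDPs.mrp` if it is not exported).

```julia
# Sketch: assumes the MDP `M` from the recipe above is in scope.
policy = [1, 2, 2]         # one action index per state; plays the role of π
P_π, r_π = mrp(M, policy)  # policy transition matrix and reward vector
size(P_π)                  # (3, 3)
```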
7 changes: 4 additions & 3 deletions src/algorithms/mrp.jl
@@ -12,10 +12,11 @@ MDP `model` and policy `π`. Also supports terminal states.
Does not support duplicate entries in transition probabilities.
"""
function mrp!(P_π::AbstractMatrix{<:Real}, r_π::AbstractVector{<:Real},
model::TabMDP, π::AbstractVector{Int})
model::TabMDP, π::AbstractVector{<:Integer})
S = state_count(model)
fill!(P_π, 0.); fill!(r_π, 0.)
for s ∈ 1:S
#TODO: remove the definition of terminal states
if !isterminal(model, s)
for (sn, p, r) ∈ transition(model, s, π[s])
P_π[s,sn] ≈ 0. ||
@@ -35,10 +36,10 @@ end
Compute the transition matrix `P_π` and reward vector `r_π` for the
MDP `model` and policy `π`. See mrp! for more details.
"""
function mrp(model::TabMDP, π::AbstractVector{Int})
function mrp(model::TabMDP, π::AbstractVector{<:Integer})
S = state_count(model)
P_π = Matrix{Float64}(undef,S,S)
r_π = Vector(undef, S)
r_π = Vector{Float64}(undef, S)
mrp!(P_π, r_π, model, π)
(P_π, r_π)
end
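As a hedged usage sketch (not part of the commit), the in-place `mrp!` variant lets the buffers be allocated once and reused across policies; the model below is illustrative and built only with functions appearing in this diff.

```julia
using MDPs

# Sketch: a tiny two-state, two-action MDP built with make_int_mdp, then
# evaluated in place with mrp!. Assumes mrp! is reachable from MDPs
# (qualify as MDPs.mrp! if it is not exported).
P1 = [1.0 0.0; 0.0 1.0]
P2 = [0.0 1.0; 1.0 0.0]
r  = [1.0, 0.0]                      # state-action rewards, one entry per state
model = make_int_mdp([P1, P2], [r, r])

S = state_count(model)
P_π = Matrix{Float64}(undef, S, S)   # preallocated, reusable across policies
r_π = Vector{Float64}(undef, S)

policy = fill(2, S)                  # always take action 2 (plays the role of π)
mrp!(P_π, r_π, model, policy)        # fills P_π and r_π in place
```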
8 changes: 5 additions & 3 deletions src/domains/gambler.jl
@@ -6,12 +6,14 @@ mt(st, prob,rew) =
(Int(st), Float64(prob), Float64(rew))::Tuple{Int, Float64, Float64}

"""
Ruin(win, max_capital)
Gambler's ruin. Can decide how much to bet at any point in time. With some
probability `p`, the bet is doubled, and with `1-p` it is lost. The
probability `win`, the bet is doubled, and with `1-win` it is lost. The
reward is 1 if it achieves some terminal capital and 0 otherwise.
Capital = state - 1
Bet = action - 1
- Capital = state - 1
- Bet = action - 1
Available actions are 1, ..., state - 1.
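For reference, a hedged construction sketch for the domain described above (not part of the commit); the import path for `Ruin` is not shown in this diff and is an assumption.

```julia
using MDPs
# Assumption: `Ruin` is accessible from the gambler domain module; adjust the
# import/qualification to wherever the module is exported in your installation.
model = Ruin(0.7, 10)   # win probability 0.7, maximal capital 10
state_count(model)      # one state per capital level (capital = state - 1)
```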
25 changes: 14 additions & 11 deletions src/models/integral.jl
@@ -197,29 +197,32 @@ function load_mdp(input; idoutcome = nothing, docompress = false)
IntMDP(states)
end


# Expand a reward spec for state `s` into a per-next-state vector of length `n`:
# vectors give state-action rewards (repeated); matrices give their row `s` (state-action-state).
_make_reward(r::Vector{<:Number}, s, n) = repeat([r[s]], n)
_make_reward(R::Matrix{<:Number}, s, n) = R[s,:]


"""
make_int_mdp(Ps, rs)
Build IntMDP from a list of transition probabilities `Ps` and reward vectors
`rs` for each action in the MDP. Each row of the transition matrix represents
the probabilities of transitioning to next states.
`rs` for each action in the MDP. If `rs` are vectors, then they are assumed
to be state-action rewards. If `rs` are matrices then they are assumed to be
state-action-state rewards. Each row of the transition matrix represents the
probabilities of transitioning to next states, and each row of a reward matrix
holds the corresponding transition rewards.
"""
function make_int_mdp(Ps::AbstractVector{Matrix{X}},
rs::AbstractVector{Vector{Y}}) where {X <: Number, Y <: Number}
function make_int_mdp(Ps::AbstractVector{<:Matrix}, rs::AbstractVector{<:Array})

isempty(Ps) && error("Must have at least one action.")
length(Ps) == length(rs) || error("Dimensions must match.")
length(Ps) == length(rs) || error("Ps and rs lengths must match.")

statecount = size(Ps[1])[1]
actioncount = length(Ps)

states = Vector{IntState}(undef, statecount)
for s ∈ 1:statecount
actions = Vector{IntAction}(undef, actioncount)
for a ∈ 1:actioncount
actions[a] = IntAction(1:statecount, Ps[a][s,:],
repeat([rs[a][s]], statecount) )
end
actions = [
IntAction(1:statecount, Ps[a][s,:], _make_reward(rs[a], s, statecount))
for a ∈ eachindex(Ps,rs)]
states[s] = IntState(actions)
end
IntMDP(states)
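To illustrate the matrix form of `rs` documented above, a small hedged sketch (not part of the commit): each reward matrix is indexed by (state, next state), so the reward can depend on the realized transition.

```julia
using MDPs

# Sketch: state-action-state rewards, one matrix per action.
P1 = [0.5 0.5; 0.0 1.0]
P2 = [1.0 0.0; 0.3 0.7]
R1 = [0.0 1.0; 0.0 0.0]   # under action 1, reward 1 only for the 1 -> 2 transition
R2 = zeros(2, 2)          # action 2 earns nothing
M = make_int_mdp([P1, P2], [R1, R2])
state_count(M)            # 2
```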
