Create CUDA extension #259

Merged · 8 commits · Oct 10, 2023
9 changes: 8 additions & 1 deletion Project.toml
@@ -16,6 +16,12 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"

[weakdeps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"

[extensions]
EvoTreesCUDAExt = "CUDA"

[compat]
BSON = "0.3"
CUDA = "3.0, 4.0, 5.0"
@@ -29,6 +35,7 @@ Tables = "1.9"
julia = "1.6"

[extras]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
@@ -37,4 +44,4 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
docs = ["Documenter"]
test = ["DataFrames", "Test", "MLJBase", "MLJTestInterface"]
test = ["CUDA", "DataFrames", "Test", "MLJBase", "MLJTestInterface"]
6 changes: 6 additions & 0 deletions docs/src/index.md
@@ -68,6 +68,12 @@ m = fit_evotree(config, dtrain; target_name="y", fnames=["x1", "x3"]);

### GPU Acceleration

EvoTrees supports training and inference on NVIDIA GPUs with [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl).
Note that on Julia ≥ 1.9, CUDA support is only enabled when CUDA.jl is installed and loaded, either by another package or explicitly, for example:
```julia
using CUDA
```

If running on a CUDA-enabled machine, training and inference on the GPU can be triggered through the `device` kwarg:

```julia
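# Hedged sketch of the truncated remainder of this example: the `device` kwarg
# selects GPU execution for both training and prediction.
m = fit_evotree(config, dtrain; target_name="y", device="gpu")
p = m(dtrain; device="gpu")
```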
4 changes: 0 additions & 4 deletions docs/src/internals.md
@@ -19,7 +19,6 @@ EvoTrees.update_gains!
EvoTrees.predict!
EvoTrees.subsample
EvoTrees.split_set_chunk!
EvoTrees.split_chunk_kernel!
```

## Histogram
@@ -28,7 +27,4 @@ EvoTrees.split_chunk_kernel!
EvoTrees.get_edges
EvoTrees.binarize
EvoTrees.update_hist!
EvoTrees.hist_kernel!
EvoTrees.hist_kernel_vec!
EvoTrees.predict_kernel!
```
22 changes: 22 additions & 0 deletions ext/EvoTreesCUDAExt/EvoTreesCUDAExt.jl
@@ -0,0 +1,22 @@
module EvoTreesCUDAExt

using EvoTrees
using CUDA

# Device-specific helpers: these supply the GPU implementations (the CPU counterparts live in EvoTrees itself)
EvoTrees.device_ones(::Type{<:EvoTrees.GPU}, ::Type{T}, n::Int) where {T} = CUDA.ones(T, n)
EvoTrees.device_array_type(::Type{<:EvoTrees.GPU}) = CuArray
function EvoTrees.post_fit_gc(::Type{<:EvoTrees.GPU})
GC.gc(true)
CUDA.reclaim()
end

include("loss.jl")
include("eval.jl")
include("predict.jl")
include("init.jl")
include("subsample.jl")
include("fit-utils.jl")
include("fit.jl")

end # module
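
These three methods are the device hooks the rest of the extension builds on: EvoTrees dispatches on its `GPU` device type, and the extension supplies the CUDA-backed implementations. A hedged sketch of the dispatch in action (assumes a functional GPU; `EvoTrees.GPU` is the device type referenced above):

```julia
using EvoTrees, CUDA

v = EvoTrees.device_ones(EvoTrees.GPU, Float32, 8)      # CuArray of Float32 ones
typeof(v) <: EvoTrees.device_array_type(EvoTrees.GPU)   # true: a CuArray
EvoTrees.post_fit_gc(EvoTrees.GPU)                      # reclaims GPU memory after a fit
```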
22 changes: 11 additions & 11 deletions src/gpu/eval.jl → ext/EvoTreesCUDAExt/eval.jl
@@ -8,7 +8,7 @@ function eval_mse_kernel!(eval::CuDeviceVector{T}, p::CuDeviceMatrix{T}, y::CuDe
end
return nothing
end
function mse(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
function EvoTrees.mse(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
threads = min(MAX_THREADS, length(y))
blocks = cld(length(y), threads)
@cuda blocks = blocks threads = threads eval_mse_kernel!(eval, p, y, w)
@@ -19,8 +19,8 @@ end
########################
# RMSE
########################
rmse(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat} =
sqrt(rmse(p, y, w; MAX_THREADS, kwargs...))
EvoTrees.rmse(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat} =
sqrt(EvoTrees.rmse(p, y, w; MAX_THREADS, kwargs...))

########################
# MAE
@@ -32,7 +32,7 @@ function eval_mae_kernel!(eval::CuDeviceVector{T}, p::CuDeviceMatrix{T}, y::CuDe
end
return nothing
end
function mae(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
function EvoTrees.mae(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
threads = min(MAX_THREADS, length(y))
blocks = cld(length(y), threads)
@cuda blocks = blocks threads = threads eval_mae_kernel!(eval, p, y, w)
@@ -51,7 +51,7 @@ function eval_logloss_kernel!(eval::CuDeviceVector{T}, p::CuDeviceMatrix{T}, y::
end
return nothing
end
function logloss(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
function EvoTrees.logloss(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
threads = min(MAX_THREADS, length(y))
blocks = cld(length(y), threads)
@cuda blocks = blocks threads = threads eval_logloss_kernel!(eval, p, y, w)
@@ -70,7 +70,7 @@ function eval_gaussian_kernel!(eval::CuDeviceVector{T}, p::CuDeviceMatrix{T}, y:
end
return nothing
end
function gaussian_mle(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
function EvoTrees.gaussian_mle(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
threads = min(MAX_THREADS, length(y))
blocks = cld(length(y), threads)
@cuda blocks = blocks threads = threads eval_gaussian_kernel!(eval, p, y, w)
@@ -91,7 +91,7 @@ function eval_poisson_kernel!(eval::CuDeviceVector{T}, p::CuDeviceMatrix{T}, y::
return nothing
end

function poisson(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
function EvoTrees.poisson(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
threads = min(MAX_THREADS, length(y))
blocks = cld(length(y), threads)
@cuda blocks = blocks threads = threads eval_poisson_kernel!(eval, p, y, w)
@@ -111,7 +111,7 @@ function eval_gamma_kernel!(eval::CuDeviceVector{T}, p::CuDeviceMatrix{T}, y::Cu
return nothing
end

function gamma(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
function EvoTrees.gamma(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
threads = min(MAX_THREADS, length(y))
blocks = cld(length(y), threads)
@cuda blocks = blocks threads = threads eval_gamma_kernel!(eval, p, y, w)
@@ -133,7 +133,7 @@ function eval_tweedie_kernel!(eval::CuDeviceVector{T}, p::CuDeviceMatrix{T}, y::
return nothing
end

function tweedie(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
function EvoTrees.tweedie(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
threads = min(MAX_THREADS, length(y))
blocks = cld(length(y), threads)
@cuda blocks = blocks threads = threads eval_tweedie_kernel!(eval, p, y, w)
@@ -158,10 +158,10 @@ function eval_mlogloss_kernel!(eval::CuDeviceVector{T}, p::CuDeviceMatrix{T}, y:
return nothing
end

function mlogloss(p::CuMatrix{T}, y::CuVector, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
function EvoTrees.mlogloss(p::CuMatrix{T}, y::CuVector, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
threads = min(MAX_THREADS, length(y))
blocks = cld(length(y), threads)
@cuda blocks = blocks threads = threads eval_mlogloss_kernel!(eval, p, y, w)
CUDA.synchronize()
return sum(eval) / sum(w)
end
end
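
All eleven edits in this file follow one pattern: the GPU metric methods are now defined on the parent functions (`EvoTrees.mse`, `EvoTrees.logloss`, ...) rather than on bare local names, since a package extension must qualify a function with its owning module in order to add methods to it. The launch configuration repeated in every wrapper is plain one-thread-per-observation arithmetic; a small sketch of the sizing (no GPU required to follow it):

```julia
# One thread per observation, capped at the per-block limit;
# cld rounds up so blocks * threads covers all n indices.
n = 1_000_000
MAX_THREADS = 1024
threads = min(MAX_THREADS, n)  # 1024
blocks = cld(n, threads)       # 977
@assert blocks * threads >= n
```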
16 changes: 4 additions & 12 deletions src/gpu/fit-utils.jl → ext/EvoTreesCUDAExt/fit-utils.jl
@@ -1,6 +1,3 @@
"""
hist_kernel!
"""
function hist_kernel!(h∇::CuDeviceArray{T,3}, ∇::CuDeviceMatrix{S}, x_bin, is, js) where {T,S}
tix, tiy, k = threadIdx().z, threadIdx().y, threadIdx().x
bdx, bdy = blockDim().z, blockDim().y
@@ -48,9 +45,6 @@ function update_hist_gpu!(h, h∇, ∇, x_bin, is, js, jsc)
return nothing
end

"""
hist_kernel_vec!
"""
function hist_kernel_vec!(h∇, ∇, x_bin, is)
tix, k = threadIdx().x, threadIdx().y
bdx = blockDim().x
@@ -103,10 +97,8 @@ function update_hist_gpu_vec!(h, h∇, ∇, x_bin, is, js::Vector)
return nothing
end

"""
Multi-threads split_set!
Take a view into left and right placeholders. Right ids are assigned at the end of the length of the current node set.
"""
# Multi-threads split_set!
# Take a view into left and right placeholders. Right ids are assigned at the end of the length of the current node set.
function split_chunk_kernel!(
left::CuDeviceVector{S},
right::CuDeviceVector{S},
@@ -149,7 +141,7 @@ function split_chunk_kernel!(
return nothing
end

function split_views_kernel!(
function EvoTrees.split_views_kernel!(
out::CuDeviceVector{S},
left::CuDeviceVector{S},
right::CuDeviceVector{S},
@@ -208,7 +200,7 @@ function split_set_threads_gpu!(out, left, right, is, x_bin, feat, cond_bin, fea
sum_lefts = sum(lefts)
cumsum_lefts = cumsum(lefts)
cumsum_rights = cumsum(rights)
@cuda blocks = nblocks threads = 1 split_views_kernel!(
@cuda blocks = nblocks threads = 1 EvoTrees.split_views_kernel!(
out,
left,
right,
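
The kernels above implement a stable partition of a node's observation indices: ids that satisfy the split condition are packed at the front of the node's range, and the rest are written from the end, chunk by chunk. A hedged CPU sketch of the same partition, ignoring the chunked GPU layout:

```julia
# Stable left/right split on a binned feature, mirroring what the kernels compute.
is       = [1, 2, 3, 4, 5, 6]   # observation indices in the current node
x_bin    = [3, 7, 1, 9, 4, 2]   # bin of the split feature for each observation
cond_bin = 4                    # observations with bin <= cond_bin go left
left  = [i for i in is if x_bin[i] <= cond_bin]   # [1, 3, 5, 6]
right = [i for i in is if x_bin[i] >  cond_bin]   # [2, 4]
```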
50 changes: 25 additions & 25 deletions src/gpu/fit.jl → ext/EvoTreesCUDAExt/fit.jl
@@ -1,15 +1,15 @@
function grow_evotree!(evotree::EvoTree{L,K}, cache, params::EvoTypes{L}, ::Type{GPU}) where {L,K}
function EvoTrees.grow_evotree!(evotree::EvoTree{L,K}, cache, params::EvoTrees.EvoTypes{L}, ::Type{<:EvoTrees.GPU}) where {L,K}

# compute gradients
update_grads!(cache.∇, cache.pred, cache.y, params)
EvoTrees.update_grads!(cache.∇, cache.pred, cache.y, params)
# subsample rows
cache.nodes[1].is =
subsample(cache.is_in, cache.is_out, cache.mask, params.rowsample, params.rng)
EvoTrees.subsample(cache.is_in, cache.is_out, cache.mask, params.rowsample, params.rng)
# subsample cols
sample!(params.rng, cache.js_, cache.js, replace=false, ordered=true)
EvoTrees.sample!(params.rng, cache.js_, cache.js, replace=false, ordered=true)

# assign a root and grow tree
tree = Tree{L,K}(params.max_depth)
tree = EvoTrees.Tree{L,K}(params.max_depth)
grow! = params.tree_type == "oblivious" ? grow_otree! : grow_tree!
grow!(
tree,
@@ -27,16 +27,16 @@ function grow_evotree!(evotree::EvoTree{L,K}, cache, params::EvoTypes{L}, ::Type
cache.monotone_constraints,
)
push!(evotree.trees, tree)
predict!(cache.pred, tree, cache.x_bin, cache.feattypes_gpu)
EvoTrees.predict!(cache.pred, tree, cache.x_bin, cache.feattypes_gpu)
cache[:info][:nrounds] += 1
return nothing
end

# grow a single binary tree - grow through all depth
function grow_tree!(
tree::Tree{L,K},
tree::EvoTrees.Tree{L,K},
nodes::Vector{N},
params::EvoTypes{L},
params::EvoTrees.EvoTypes{L},
∇::CuMatrix,
edges,
js,
@@ -66,7 +66,7 @@ function grow_tree!(

# initialize summary stats
nodes[1].∑ .= Vector(vec(sum(∇[:, nodes[1].is], dims=2)))
nodes[1].gain = get_gain(params, nodes[1].∑) # should use a GPU version?
nodes[1].gain = EvoTrees.get_gain(params, nodes[1].∑) # should use a GPU version?

# grow while there are remaining active nodes
while length(n_current) > 0 && depth <= params.max_depth
@@ -90,14 +90,14 @@ function grow_tree!(
update_hist_gpu!(nodes[n].h, h∇, ∇, x_bin, nodes[n].is, jsg, js)
end
end
@threads for n ∈ sort(n_current)
update_gains!(nodes[n], js, params, feattypes, monotone_constraints)
Threads.@threads for n ∈ sort(n_current)
EvoTrees.update_gains!(nodes[n], js, params, feattypes, monotone_constraints)
end
end

for n ∈ sort(n_current)
if depth == params.max_depth || nodes[n].∑[end] <= params.min_weight
pred_leaf_cpu!(tree.pred, n, nodes[n].∑, params, ∇, nodes[n].is)
EvoTrees.pred_leaf_cpu!(tree.pred, n, nodes[n].∑, params, ∇, nodes[n].is)
else
best = findmax(findmax.(nodes[n].gains))
best_gain = best[1][1]
@@ -126,8 +126,8 @@ function grow_tree!(
nodes[n<<1].is, nodes[n<<1+1].is = _left, _right
nodes[n<<1].∑ .= nodes[n].hL[best_feat][:, best_bin]
nodes[n<<1+1].∑ .= nodes[n].hR[best_feat][:, best_bin]
nodes[n<<1].gain = get_gain(params, nodes[n<<1].∑)
nodes[n<<1+1].gain = get_gain(params, nodes[n<<1+1].∑)
nodes[n<<1].gain = EvoTrees.get_gain(params, nodes[n<<1].∑)
nodes[n<<1+1].gain = EvoTrees.get_gain(params, nodes[n<<1+1].∑)

if length(_right) >= length(_left)
push!(n_next, n << 1)
Expand All @@ -137,7 +137,7 @@ function grow_tree!(
push!(n_next, n << 1)
end
else
pred_leaf_cpu!(tree.pred, n, nodes[n].∑, params, ∇, nodes[n].is)
EvoTrees.pred_leaf_cpu!(tree.pred, n, nodes[n].∑, params, ∇, nodes[n].is)
end
end
end
@@ -151,9 +151,9 @@ end

# grow a single oblivious tree - grow through all depth
function grow_otree!(
tree::Tree{L,K},
tree::EvoTrees.Tree{L,K},
nodes::Vector{N},
params::EvoTypes{L},
params::EvoTrees.EvoTypes{L},
∇::CuMatrix,
edges,
js,
@@ -183,7 +183,7 @@ function grow_otree!(

# initialize summary stats
nodes[1].∑ .= Vector(vec(sum(∇[:, nodes[1].is], dims=2)))
nodes[1].gain = get_gain(params, nodes[1].∑) # should use a GPU version?
nodes[1].gain = EvoTrees.get_gain(params, nodes[1].∑) # should use a GPU version?

# grow while there are remaining active nodes
while length(n_current) > 0 && depth <= params.max_depth
@@ -197,7 +197,7 @@ function grow_otree!(
if depth == params.max_depth || min_weight_flag
for n in n_current
# @info "length(nodes[n].is)" length(nodes[n].is) depth n
pred_leaf_cpu!(tree.pred, n, nodes[n].∑, params, ∇, nodes[n].is)
EvoTrees.pred_leaf_cpu!(tree.pred, n, nodes[n].∑, params, ∇, nodes[n].is)
end
else
# update histograms
@@ -217,8 +217,8 @@ function grow_otree!(
update_hist_gpu!(nodes[n].h, h∇, ∇, x_bin, nodes[n].is, jsg, js)
end
end
@threads for n ∈ n_current
update_gains!(nodes[n], js, params, feattypes, monotone_constraints)
Threads.@threads for n ∈ n_current
EvoTrees.update_gains!(nodes[n], js, params, feattypes, monotone_constraints)
end

# initialize gains for node 1 in which all gains of a given depth will be accumulated
@@ -273,8 +273,8 @@ function grow_otree!(
nodes[n<<1].is, nodes[n<<1+1].is = _left, _right
nodes[n<<1].∑ .= nodes[n].hL[best_feat][:, best_bin]
nodes[n<<1+1].∑ .= nodes[n].hR[best_feat][:, best_bin]
nodes[n<<1].gain = get_gain(params, nodes[n<<1].∑)
nodes[n<<1+1].gain = get_gain(params, nodes[n<<1+1].∑)
nodes[n<<1].gain = EvoTrees.get_gain(params, nodes[n<<1].∑)
nodes[n<<1+1].gain = EvoTrees.get_gain(params, nodes[n<<1+1].∑)

if length(_right) >= length(_left)
push!(n_next, n << 1)
@@ -286,7 +286,7 @@ function grow_otree!(
end
else
for n in n_current
pred_leaf_cpu!(tree.pred, n, nodes[n].∑, params, ∇, nodes[n].is)
EvoTrees.pred_leaf_cpu!(tree.pred, n, nodes[n].∑, params, ∇, nodes[n].is)
end
end
end
@@ -295,4 +295,4 @@ end
end # end of loop over current nodes for a given depth

return nothing
end
end
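
Both growers rely on implicit binary-heap numbering of tree nodes: the children of node `n` are `n << 1` and `n << 1 + 1` (in Julia, `<<` binds tighter than `+`), which is why split results land in `nodes[n<<1]` and `nodes[n<<1+1]`. A small illustration:

```julia
n = 3
left_child  = n << 1       # 6
right_child = n << 1 + 1   # 7, parsed as (n << 1) + 1
parent      = n >> 1       # 1
```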