Create CUDA extension #259

Merged · 8 commits · Oct 10, 2023
9 changes: 8 additions & 1 deletion Project.toml
@@ -16,6 +16,12 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"

[weakdeps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"

[extensions]
EvoTreesCUDAExt = "CUDA"

[compat]
BSON = "0.3"
CUDA = "3.0, 4.0, 5.0"
@@ -29,6 +35,7 @@ Tables = "1.9"
julia = "1.6"

[extras]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
@@ -37,4 +44,4 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
docs = ["Documenter"]
test = ["DataFrames", "Test", "MLJBase", "MLJTestInterface"]
test = ["CUDA", "DataFrames", "Test", "MLJBase", "MLJTestInterface"]
6 changes: 6 additions & 0 deletions docs/src/index.md
@@ -68,6 +68,12 @@ m = fit_evotree(config, dtrain; target_name="y", fnames=["x1", "x3"]);

### GPU Acceleration

EvoTrees supports training and inference on NVIDIA GPUs with [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl).
Note that on Julia ≥ 1.9, CUDA support is only enabled when CUDA.jl is installed and loaded, either by another package or explicitly, for example:
```julia
using CUDA
```

If running on a CUDA-enabled machine, training and inference on the GPU can be triggered through the `device` kwarg:

```julia
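# Hedged sketch of the truncated remainder of this example: the `device` kwarg
# selects GPU execution for both training and prediction.
m = fit_evotree(config, dtrain; target_name="y", device="gpu")
p = m(dtrain; device="gpu")
```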
4 changes: 0 additions & 4 deletions docs/src/internals.md
@@ -19,7 +19,6 @@ EvoTrees.update_gains!
EvoTrees.predict!
EvoTrees.subsample
EvoTrees.split_set_chunk!
EvoTrees.split_chunk_kernel!
```

## Histogram
@@ -28,7 +27,4 @@ EvoTrees.split_chunk_kernel!
EvoTrees.get_edges
EvoTrees.binarize
EvoTrees.update_hist!
EvoTrees.hist_kernel!
EvoTrees.hist_kernel_vec!
EvoTrees.predict_kernel!
```
22 changes: 22 additions & 0 deletions ext/EvoTreesCUDAExt/EvoTreesCUDAExt.jl
@@ -0,0 +1,22 @@
module EvoTreesCUDAExt

using EvoTrees
using CUDA

# Device-specific helpers: these supply the GPU implementations (the CPU counterparts live in EvoTrees itself)
EvoTrees.device_ones(::Type{<:EvoTrees.GPU}, ::Type{T}, n::Int) where {T} = CUDA.ones(T, n)
EvoTrees.device_array_type(::Type{<:EvoTrees.GPU}) = CuArray
function EvoTrees.post_fit_gc(::Type{<:EvoTrees.GPU})
GC.gc(true)
CUDA.reclaim()
end

include("loss.jl")
include("eval.jl")
include("predict.jl")
include("init.jl")
include("subsample.jl")
include("fit-utils.jl")
include("fit.jl")

end # module
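
These three methods are the device hooks the rest of the extension builds on: EvoTrees dispatches on its `GPU` device type, and the extension supplies the CUDA-backed implementations. A hedged sketch of the dispatch in action (assumes a functional GPU; `EvoTrees.GPU` is the device type referenced above):

```julia
using EvoTrees, CUDA

v = EvoTrees.device_ones(EvoTrees.GPU, Float32, 8)      # CuArray of Float32 ones
typeof(v) <: EvoTrees.device_array_type(EvoTrees.GPU)   # true: a CuArray
EvoTrees.post_fit_gc(EvoTrees.GPU)                      # reclaims GPU memory after a fit
```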
22 changes: 11 additions & 11 deletions src/gpu/eval.jl → ext/EvoTreesCUDAExt/eval.jl
@@ -8,7 +8,7 @@ function eval_mse_kernel!(eval::CuDeviceVector{T}, p::CuDeviceMatrix{T}, y::CuDe
end
return nothing
end
function mse(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
function EvoTrees.mse(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
threads = min(MAX_THREADS, length(y))
blocks = cld(length(y), threads)
@cuda blocks = blocks threads = threads eval_mse_kernel!(eval, p, y, w)
@@ -19,8 +19,8 @@ end
########################
# RMSE
########################
rmse(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat} =
sqrt(rmse(p, y, w; MAX_THREADS, kwargs...))
EvoTrees.rmse(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat} =
sqrt(EvoTrees.rmse(p, y, w; MAX_THREADS, kwargs...))

########################
# MAE
@@ -32,7 +32,7 @@ function eval_mae_kernel!(eval::CuDeviceVector{T}, p::CuDeviceMatrix{T}, y::CuDe
end
return nothing
end
function mae(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
function EvoTrees.mae(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
threads = min(MAX_THREADS, length(y))
blocks = cld(length(y), threads)
@cuda blocks = blocks threads = threads eval_mae_kernel!(eval, p, y, w)
@@ -51,7 +51,7 @@ function eval_logloss_kernel!(eval::CuDeviceVector{T}, p::CuDeviceMatrix{T}, y::
end
return nothing
end
function logloss(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
function EvoTrees.logloss(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
threads = min(MAX_THREADS, length(y))
blocks = cld(length(y), threads)
@cuda blocks = blocks threads = threads eval_logloss_kernel!(eval, p, y, w)
@@ -70,7 +70,7 @@ function eval_gaussian_kernel!(eval::CuDeviceVector{T}, p::CuDeviceMatrix{T}, y:
end
return nothing
end
function gaussian_mle(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
function EvoTrees.gaussian_mle(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
threads = min(MAX_THREADS, length(y))
blocks = cld(length(y), threads)
@cuda blocks = blocks threads = threads eval_gaussian_kernel!(eval, p, y, w)
@@ -91,7 +91,7 @@ function eval_poisson_kernel!(eval::CuDeviceVector{T}, p::CuDeviceMatrix{T}, y::
return nothing
end

function poisson(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
function EvoTrees.poisson(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
threads = min(MAX_THREADS, length(y))
blocks = cld(length(y), threads)
@cuda blocks = blocks threads = threads eval_poisson_kernel!(eval, p, y, w)
@@ -111,7 +111,7 @@ function eval_gamma_kernel!(eval::CuDeviceVector{T}, p::CuDeviceMatrix{T}, y::Cu
return nothing
end

function gamma(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
function EvoTrees.gamma(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
threads = min(MAX_THREADS, length(y))
blocks = cld(length(y), threads)
@cuda blocks = blocks threads = threads eval_gamma_kernel!(eval, p, y, w)
@@ -133,7 +133,7 @@ function eval_tweedie_kernel!(eval::CuDeviceVector{T}, p::CuDeviceMatrix{T}, y::
return nothing
end

function tweedie(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
function EvoTrees.tweedie(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
threads = min(MAX_THREADS, length(y))
blocks = cld(length(y), threads)
@cuda blocks = blocks threads = threads eval_tweedie_kernel!(eval, p, y, w)
@@ -158,10 +158,10 @@ function eval_mlogloss_kernel!(eval::CuDeviceVector{T}, p::CuDeviceMatrix{T}, y:
return nothing
end

function mlogloss(p::CuMatrix{T}, y::CuVector, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
function EvoTrees.mlogloss(p::CuMatrix{T}, y::CuVector, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
threads = min(MAX_THREADS, length(y))
blocks = cld(length(y), threads)
@cuda blocks = blocks threads = threads eval_mlogloss_kernel!(eval, p, y, w)
CUDA.synchronize()
return sum(eval) / sum(w)
end
end
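
All eleven edits in this file follow one pattern: the GPU metric methods are now defined on the parent functions (`EvoTrees.mse`, `EvoTrees.logloss`, ...) rather than on bare local names, since a package extension must qualify a function with its owning module in order to add methods to it. The launch configuration repeated in every wrapper is plain one-thread-per-observation arithmetic; a small sketch of the sizing (no GPU required to follow it):

```julia
# One thread per observation, capped at the per-block limit;
# cld rounds up so blocks * threads covers all n indices.
n = 1_000_000
MAX_THREADS = 1024
threads = min(MAX_THREADS, n)  # 1024
blocks = cld(n, threads)       # 977
@assert blocks * threads >= n
```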
16 changes: 4 additions & 12 deletions src/gpu/fit-utils.jl → ext/EvoTreesCUDAExt/fit-utils.jl
@@ -1,6 +1,3 @@
"""
hist_kernel!
"""
function hist_kernel!(h∇::CuDeviceArray{T,3}, ∇::CuDeviceMatrix{S}, x_bin, is, js) where {T,S}
tix, tiy, k = threadIdx().z, threadIdx().y, threadIdx().x
bdx, bdy = blockDim().z, blockDim().y
@@ -48,9 +45,6 @@ function update_hist_gpu!(h, h∇, ∇, x_bin, is, js, jsc)
return nothing
end

"""
hist_kernel_vec!
"""
function hist_kernel_vec!(h∇, ∇, x_bin, is)
tix, k = threadIdx().x, threadIdx().y
bdx = blockDim().x
@@ -103,10 +97,8 @@ function update_hist_gpu_vec!(h, h∇, ∇, x_bin, is, js::Vector)
return nothing
end

"""
Multi-threads split_set!
Take a view into left and right placeholders. Right ids are assigned at the end of the length of the current node set.
"""
# Multi-threads split_set!
# Take a view into left and right placeholders. Right ids are assigned at the end of the length of the current node set.
function split_chunk_kernel!(
left::CuDeviceVector{S},
right::CuDeviceVector{S},
@@ -149,7 +141,7 @@ function split_chunk_kernel!(
return nothing
end

function split_views_kernel!(
function EvoTrees.split_views_kernel!(
out::CuDeviceVector{S},
left::CuDeviceVector{S},
right::CuDeviceVector{S},
@@ -208,7 +200,7 @@ function split_set_threads_gpu!(out, left, right, is, x_bin, feat, cond_bin, fea
sum_lefts = sum(lefts)
cumsum_lefts = cumsum(lefts)
cumsum_rights = cumsum(rights)
@cuda blocks = nblocks threads = 1 split_views_kernel!(
@cuda blocks = nblocks threads = 1 EvoTrees.split_views_kernel!(
out,
left,
right,
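
The kernels above implement a stable partition of a node's observation indices: ids that satisfy the split condition are packed at the front of the node's range, and the rest are written from the end, chunk by chunk. A hedged CPU sketch of the same partition, ignoring the chunked GPU layout:

```julia
# Stable left/right split on a binned feature, mirroring what the kernels compute.
is       = [1, 2, 3, 4, 5, 6]   # observation indices in the current node
x_bin    = [3, 7, 1, 9, 4, 2]   # bin of the split feature for each observation
cond_bin = 4                    # observations with bin <= cond_bin go left
left  = [i for i in is if x_bin[i] <= cond_bin]   # [1, 3, 5, 6]
right = [i for i in is if x_bin[i] >  cond_bin]   # [2, 4]
```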
50 changes: 25 additions & 25 deletions src/gpu/fit.jl → ext/EvoTreesCUDAExt/fit.jl
@@ -1,15 +1,15 @@
function grow_evotree!(evotree::EvoTree{L,K}, cache, params::EvoTypes{L}, ::Type{GPU}) where {L,K}
function EvoTrees.grow_evotree!(evotree::EvoTree{L,K}, cache, params::EvoTrees.EvoTypes{L}, ::Type{<:EvoTrees.GPU}) where {L,K}

# compute gradients
update_grads!(cache.∇, cache.pred, cache.y, params)
EvoTrees.update_grads!(cache.∇, cache.pred, cache.y, params)
# subsample rows
cache.nodes[1].is =
subsample(cache.is_in, cache.is_out, cache.mask, params.rowsample, params.rng)
EvoTrees.subsample(cache.is_in, cache.is_out, cache.mask, params.rowsample, params.rng)
# subsample cols
sample!(params.rng, cache.js_, cache.js, replace=false, ordered=true)
EvoTrees.sample!(params.rng, cache.js_, cache.js, replace=false, ordered=true)

# assign a root and grow tree
tree = Tree{L,K}(params.max_depth)
tree = EvoTrees.Tree{L,K}(params.max_depth)
grow! = params.tree_type == "oblivious" ? grow_otree! : grow_tree!
grow!(
tree,
@@ -27,16 +27,16 @@ function grow_evotree!(evotree::EvoTree{L,K}, cache, params::EvoTypes{L}, ::Type
cache.monotone_constraints,
)
push!(evotree.trees, tree)
predict!(cache.pred, tree, cache.x_bin, cache.feattypes_gpu)
EvoTrees.predict!(cache.pred, tree, cache.x_bin, cache.feattypes_gpu)
cache[:info][:nrounds] += 1
return nothing
end

# grow a single binary tree - grow through all depth
function grow_tree!(
tree::Tree{L,K},
tree::EvoTrees.Tree{L,K},
nodes::Vector{N},
params::EvoTypes{L},
params::EvoTrees.EvoTypes{L},
∇::CuMatrix,
edges,
js,
@@ -66,7 +66,7 @@ function grow_tree!(

# initialize summary stats
nodes[1].∑ .= Vector(vec(sum(∇[:, nodes[1].is], dims=2)))
nodes[1].gain = get_gain(params, nodes[1].∑) # should use a GPU version?
nodes[1].gain = EvoTrees.get_gain(params, nodes[1].∑) # should use a GPU version?

# grow while there are remaining active nodes
while length(n_current) > 0 && depth <= params.max_depth
@@ -90,14 +90,14 @@ function grow_tree!(
update_hist_gpu!(nodes[n].h, h∇, ∇, x_bin, nodes[n].is, jsg, js)
end
end
@threads for n ∈ sort(n_current)
update_gains!(nodes[n], js, params, feattypes, monotone_constraints)
Threads.@threads for n ∈ sort(n_current)
EvoTrees.update_gains!(nodes[n], js, params, feattypes, monotone_constraints)
end
end

for n ∈ sort(n_current)
if depth == params.max_depth || nodes[n].∑[end] <= params.min_weight
pred_leaf_cpu!(tree.pred, n, nodes[n].∑, params, ∇, nodes[n].is)
EvoTrees.pred_leaf_cpu!(tree.pred, n, nodes[n].∑, params, ∇, nodes[n].is)
else
best = findmax(findmax.(nodes[n].gains))
best_gain = best[1][1]
@@ -126,8 +126,8 @@ function grow_tree!(
nodes[n<<1].is, nodes[n<<1+1].is = _left, _right
nodes[n<<1].∑ .= nodes[n].hL[best_feat][:, best_bin]
nodes[n<<1+1].∑ .= nodes[n].hR[best_feat][:, best_bin]
nodes[n<<1].gain = get_gain(params, nodes[n<<1].∑)
nodes[n<<1+1].gain = get_gain(params, nodes[n<<1+1].∑)
nodes[n<<1].gain = EvoTrees.get_gain(params, nodes[n<<1].∑)
nodes[n<<1+1].gain = EvoTrees.get_gain(params, nodes[n<<1+1].∑)

if length(_right) >= length(_left)
push!(n_next, n << 1)
Expand All @@ -137,7 +137,7 @@ function grow_tree!(
push!(n_next, n << 1)
end
else
pred_leaf_cpu!(tree.pred, n, nodes[n].∑, params, ∇, nodes[n].is)
EvoTrees.pred_leaf_cpu!(tree.pred, n, nodes[n].∑, params, ∇, nodes[n].is)
end
end
end
@@ -151,9 +151,9 @@ end

# grow a single oblivious tree - grow through all depth
function grow_otree!(
tree::Tree{L,K},
tree::EvoTrees.Tree{L,K},
nodes::Vector{N},
params::EvoTypes{L},
params::EvoTrees.EvoTypes{L},
∇::CuMatrix,
edges,
js,
@@ -183,7 +183,7 @@ function grow_otree!(

# initialize summary stats
nodes[1].∑ .= Vector(vec(sum(∇[:, nodes[1].is], dims=2)))
nodes[1].gain = get_gain(params, nodes[1].∑) # should use a GPU version?
nodes[1].gain = EvoTrees.get_gain(params, nodes[1].∑) # should use a GPU version?

# grow while there are remaining active nodes
while length(n_current) > 0 && depth <= params.max_depth
@@ -197,7 +197,7 @@ function grow_otree!(
if depth == params.max_depth || min_weight_flag
for n in n_current
# @info "length(nodes[n].is)" length(nodes[n].is) depth n
pred_leaf_cpu!(tree.pred, n, nodes[n].∑, params, ∇, nodes[n].is)
EvoTrees.pred_leaf_cpu!(tree.pred, n, nodes[n].∑, params, ∇, nodes[n].is)
end
else
# update histograms
@@ -217,8 +217,8 @@ function grow_otree!(
update_hist_gpu!(nodes[n].h, h∇, ∇, x_bin, nodes[n].is, jsg, js)
end
end
@threads for n ∈ n_current
update_gains!(nodes[n], js, params, feattypes, monotone_constraints)
Threads.@threads for n ∈ n_current
EvoTrees.update_gains!(nodes[n], js, params, feattypes, monotone_constraints)
end

# initialize gains for node 1 in which all gains of a given depth will be accumulated
@@ -273,8 +273,8 @@ function grow_otree!(
nodes[n<<1].is, nodes[n<<1+1].is = _left, _right
nodes[n<<1].∑ .= nodes[n].hL[best_feat][:, best_bin]
nodes[n<<1+1].∑ .= nodes[n].hR[best_feat][:, best_bin]
nodes[n<<1].gain = get_gain(params, nodes[n<<1].∑)
nodes[n<<1+1].gain = get_gain(params, nodes[n<<1+1].∑)
nodes[n<<1].gain = EvoTrees.get_gain(params, nodes[n<<1].∑)
nodes[n<<1+1].gain = EvoTrees.get_gain(params, nodes[n<<1+1].∑)

if length(_right) >= length(_left)
push!(n_next, n << 1)
@@ -286,7 +286,7 @@ function grow_otree!(
end
else
for n in n_current
pred_leaf_cpu!(tree.pred, n, nodes[n].∑, params, ∇, nodes[n].is)
EvoTrees.pred_leaf_cpu!(tree.pred, n, nodes[n].∑, params, ∇, nodes[n].is)
end
end
end
@@ -295,4 +295,4 @@ end
end # end of loop over current nodes for a given depth

return nothing
end
end
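
Both growers rely on implicit binary-heap numbering of tree nodes: the children of node `n` are `n << 1` and `n << 1 + 1` (in Julia, `<<` binds tighter than `+`), which is why split results land in `nodes[n<<1]` and `nodes[n<<1+1]`. A small illustration:

```julia
n = 3
left_child  = n << 1       # 6
right_child = n << 1 + 1   # 7, parsed as (n << 1) + 1
parent      = n >> 1       # 1
```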