diff --git a/Project.toml b/Project.toml
index 784acf1d..4c7c1365 100644
--- a/Project.toml
+++ b/Project.toml
@@ -16,6 +16,12 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
 
+[weakdeps]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+
+[extensions]
+EvoTreesCUDAExt = "CUDA"
+
 [compat]
 BSON = "0.3"
 CUDA = "3.0, 4.0, 5.0"
@@ -29,6 +35,7 @@ Tables = "1.9"
 julia = "1.6"
 
 [extras]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
 MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
@@ -37,4 +44,4 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
 docs = ["Documenter"]
-test = ["DataFrames", "Test", "MLJBase", "MLJTestInterface"]
+test = ["CUDA", "DataFrames", "Test", "MLJBase", "MLJTestInterface"]
diff --git a/docs/src/index.md b/docs/src/index.md
index 88fbff39..9e73aa74 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -68,6 +68,12 @@ m = fit_evotree(config, dtrain; target_name="y", fnames=["x1", "x3"]);
 
 ### GPU Acceleration
 
+EvoTrees supports training and inference on Nvidia GPUs with [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl).
+Note that on Julia ≥ 1.9, CUDA support is only enabled when CUDA.jl is installed and loaded, either by another package or explicitly with e.g.
+```julia
+using CUDA
+```
+
 If running on a CUDA enabled machine, training and inference on GPU can be triggered through the `device` kwarg:
 
 ```julia
diff --git a/docs/src/internals.md b/docs/src/internals.md
index cd1b6fe4..9de3999a 100644
--- a/docs/src/internals.md
+++ b/docs/src/internals.md
@@ -19,7 +19,6 @@ EvoTrees.update_gains!
 EvoTrees.predict!
 EvoTrees.subsample
 EvoTrees.split_set_chunk!
-EvoTrees.split_chunk_kernel!
 ```
 
 ## Histogram
@@ -28,7 +27,4 @@ EvoTrees.split_chunk_kernel!
 EvoTrees.get_edges
 EvoTrees.binarize
 EvoTrees.update_hist!
-EvoTrees.hist_kernel!
-EvoTrees.hist_kernel_vec!
-EvoTrees.predict_kernel!
 ```
diff --git a/ext/EvoTreesCUDAExt/EvoTreesCUDAExt.jl b/ext/EvoTreesCUDAExt/EvoTreesCUDAExt.jl
new file mode 100644
index 00000000..8d0f7a8c
--- /dev/null
+++ b/ext/EvoTreesCUDAExt/EvoTreesCUDAExt.jl
@@ -0,0 +1,22 @@
+module EvoTreesCUDAExt
+
+using EvoTrees
+using CUDA
+
+# This should be different on CPUs and GPUs
+EvoTrees.device_ones(::Type{<:EvoTrees.GPU}, ::Type{T}, n::Int) where {T} = CUDA.ones(T, n)
+EvoTrees.device_array_type(::Type{<:EvoTrees.GPU}) = CuArray
+function EvoTrees.post_fit_gc(::Type{<:EvoTrees.GPU})
+    GC.gc(true)
+    CUDA.reclaim()
+end
+
+include("loss.jl")
+include("eval.jl")
+include("predict.jl")
+include("init.jl")
+include("subsample.jl")
+include("fit-utils.jl")
+include("fit.jl")
+
+end # module
diff --git a/src/gpu/eval.jl b/ext/EvoTreesCUDAExt/eval.jl
similarity index 80%
rename from src/gpu/eval.jl
rename to ext/EvoTreesCUDAExt/eval.jl
index be12b94d..6ea10153 100644
--- a/src/gpu/eval.jl
+++ b/ext/EvoTreesCUDAExt/eval.jl
@@ -8,7 +8,7 @@ function eval_mse_kernel!(eval::CuDeviceVector{T}, p::CuDeviceMatrix{T}, y::CuDe
     end
     return nothing
 end
-function mse(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
+function EvoTrees.mse(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
     threads = min(MAX_THREADS, length(y))
     blocks = cld(length(y), threads)
     @cuda blocks = blocks threads = threads eval_mse_kernel!(eval, p, y, w)
@@ -19,8 +19,8 @@ end
 ########################
 # RMSE
 ########################
-rmse(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat} =
-    sqrt(rmse(p, y, w; MAX_THREADS, kwargs...))
+EvoTrees.rmse(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat} =
+    sqrt(EvoTrees.mse(p, y, w, eval; MAX_THREADS, kwargs...))
 
 ########################
 # MAE
@@ -32,7 +32,7 @@ function eval_mae_kernel!(eval::CuDeviceVector{T}, p::CuDeviceMatrix{T}, y::CuDe
     end
     return nothing
 end
-function mae(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
+function EvoTrees.mae(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
     threads = min(MAX_THREADS, length(y))
     blocks = cld(length(y), threads)
     @cuda blocks = blocks threads = threads eval_mae_kernel!(eval, p, y, w)
@@ -51,7 +51,7 @@ function eval_logloss_kernel!(eval::CuDeviceVector{T}, p::CuDeviceMatrix{T}, y::
     end
     return nothing
 end
-function logloss(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
+function EvoTrees.logloss(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
     threads = min(MAX_THREADS, length(y))
     blocks = cld(length(y), threads)
     @cuda blocks = blocks threads = threads eval_logloss_kernel!(eval, p, y, w)
@@ -70,7 +70,7 @@ function eval_gaussian_kernel!(eval::CuDeviceVector{T}, p::CuDeviceMatrix{T}, y:
     end
     return nothing
 end
-function gaussian_mle(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
+function EvoTrees.gaussian_mle(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
     threads = min(MAX_THREADS, length(y))
     blocks = cld(length(y), threads)
     @cuda blocks = blocks threads = threads eval_gaussian_kernel!(eval, p, y, w)
@@ -91,7 +91,7 @@ function eval_poisson_kernel!(eval::CuDeviceVector{T}, p::CuDeviceMatrix{T}, y::
     return nothing
 end
 
-function poisson(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
+function EvoTrees.poisson(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
     threads = min(MAX_THREADS, length(y))
     blocks = cld(length(y), threads)
     @cuda blocks = blocks threads = threads eval_poisson_kernel!(eval, p, y, w)
@@ -111,7 +111,7 @@ function eval_gamma_kernel!(eval::CuDeviceVector{T}, p::CuDeviceMatrix{T}, y::Cu
     return nothing
 end
 
-function gamma(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
+function EvoTrees.gamma(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...)
where {T<:AbstractFloat} threads = min(MAX_THREADS, length(y)) blocks = cld(length(y), threads) @cuda blocks = blocks threads = threads eval_gamma_kernel!(eval, p, y, w) @@ -133,7 +133,7 @@ function eval_tweedie_kernel!(eval::CuDeviceVector{T}, p::CuDeviceMatrix{T}, y:: return nothing end -function tweedie(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat} +function EvoTrees.tweedie(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat} threads = min(MAX_THREADS, length(y)) blocks = cld(length(y), threads) @cuda blocks = blocks threads = threads eval_tweedie_kernel!(eval, p, y, w) @@ -158,10 +158,10 @@ function eval_mlogloss_kernel!(eval::CuDeviceVector{T}, p::CuDeviceMatrix{T}, y: return nothing end -function mlogloss(p::CuMatrix{T}, y::CuVector, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat} +function EvoTrees.mlogloss(p::CuMatrix{T}, y::CuVector, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat} threads = min(MAX_THREADS, length(y)) blocks = cld(length(y), threads) @cuda blocks = blocks threads = threads eval_mlogloss_kernel!(eval, p, y, w) CUDA.synchronize() return sum(eval) / sum(w) -end \ No newline at end of file +end diff --git a/src/gpu/fit-utils.jl b/ext/EvoTreesCUDAExt/fit-utils.jl similarity index 95% rename from src/gpu/fit-utils.jl rename to ext/EvoTreesCUDAExt/fit-utils.jl index d4d285b6..31bc1f55 100644 --- a/src/gpu/fit-utils.jl +++ b/ext/EvoTreesCUDAExt/fit-utils.jl @@ -1,6 +1,3 @@ -""" - hist_kernel! -""" function hist_kernel!(h∇::CuDeviceArray{T,3}, ∇::CuDeviceMatrix{S}, x_bin, is, js) where {T,S} tix, tiy, k = threadIdx().z, threadIdx().y, threadIdx().x bdx, bdy = blockDim().z, blockDim().y @@ -48,9 +45,6 @@ function update_hist_gpu!(h, h∇, ∇, x_bin, is, js, jsc) return nothing end -""" - hist_kernel_vec! -""" function hist_kernel_vec!(h∇, ∇, x_bin, is) tix, k = threadIdx().x, threadIdx().y bdx = blockDim().x @@ -103,10 +97,8 @@ function update_hist_gpu_vec!(h, h∇, ∇, x_bin, is, js::Vector) return nothing end -""" - Multi-threads split_set! - Take a view into left and right placeholders. Right ids are assigned at the end of the length of the current node set. -""" +# Multi-threads split_set! +# Take a view into left and right placeholders. Right ids are assigned at the end of the length of the current node set. 
function split_chunk_kernel!( left::CuDeviceVector{S}, right::CuDeviceVector{S}, @@ -149,7 +141,7 @@ function split_chunk_kernel!( return nothing end -function split_views_kernel!( +function EvoTrees.split_views_kernel!( out::CuDeviceVector{S}, left::CuDeviceVector{S}, right::CuDeviceVector{S}, @@ -208,7 +200,7 @@ function split_set_threads_gpu!(out, left, right, is, x_bin, feat, cond_bin, fea sum_lefts = sum(lefts) cumsum_lefts = cumsum(lefts) cumsum_rights = cumsum(rights) - @cuda blocks = nblocks threads = 1 split_views_kernel!( + @cuda blocks = nblocks threads = 1 EvoTrees.split_views_kernel!( out, left, right, diff --git a/src/gpu/fit.jl b/ext/EvoTreesCUDAExt/fit.jl similarity index 82% rename from src/gpu/fit.jl rename to ext/EvoTreesCUDAExt/fit.jl index b0935451..f0e32cf0 100644 --- a/src/gpu/fit.jl +++ b/ext/EvoTreesCUDAExt/fit.jl @@ -1,15 +1,15 @@ -function grow_evotree!(evotree::EvoTree{L,K}, cache, params::EvoTypes{L}, ::Type{GPU}) where {L,K} +function EvoTrees.grow_evotree!(evotree::EvoTree{L,K}, cache, params::EvoTrees.EvoTypes{L}, ::Type{<:EvoTrees.GPU}) where {L,K} # compute gradients - update_grads!(cache.∇, cache.pred, cache.y, params) + EvoTrees.update_grads!(cache.∇, cache.pred, cache.y, params) # subsample rows cache.nodes[1].is = - subsample(cache.is_in, cache.is_out, cache.mask, params.rowsample, params.rng) + EvoTrees.subsample(cache.is_in, cache.is_out, cache.mask, params.rowsample, params.rng) # subsample cols - sample!(params.rng, cache.js_, cache.js, replace=false, ordered=true) + EvoTrees.sample!(params.rng, cache.js_, cache.js, replace=false, ordered=true) # assign a root and grow tree - tree = Tree{L,K}(params.max_depth) + tree = EvoTrees.Tree{L,K}(params.max_depth) grow! = params.tree_type == "oblivious" ? grow_otree! : grow_tree! grow!( tree, @@ -27,16 +27,16 @@ function grow_evotree!(evotree::EvoTree{L,K}, cache, params::EvoTypes{L}, ::Type cache.monotone_constraints, ) push!(evotree.trees, tree) - predict!(cache.pred, tree, cache.x_bin, cache.feattypes_gpu) + EvoTrees.predict!(cache.pred, tree, cache.x_bin, cache.feattypes_gpu) cache[:info][:nrounds] += 1 return nothing end # grow a single binary tree - grow through all depth function grow_tree!( - tree::Tree{L,K}, + tree::EvoTrees.Tree{L,K}, nodes::Vector{N}, - params::EvoTypes{L}, + params::EvoTrees.EvoTypes{L}, ∇::CuMatrix, edges, js, @@ -66,7 +66,7 @@ function grow_tree!( # initialize summary stats nodes[1].∑ .= Vector(vec(sum(∇[:, nodes[1].is], dims=2))) - nodes[1].gain = get_gain(params, nodes[1].∑) # should use a GPU version? + nodes[1].gain = EvoTrees.get_gain(params, nodes[1].∑) # should use a GPU version? 
# grow while there are remaining active nodes while length(n_current) > 0 && depth <= params.max_depth @@ -90,14 +90,14 @@ function grow_tree!( update_hist_gpu!(nodes[n].h, h∇, ∇, x_bin, nodes[n].is, jsg, js) end end - @threads for n ∈ sort(n_current) - update_gains!(nodes[n], js, params, feattypes, monotone_constraints) + Threads.@threads for n ∈ sort(n_current) + EvoTrees.update_gains!(nodes[n], js, params, feattypes, monotone_constraints) end end for n ∈ sort(n_current) if depth == params.max_depth || nodes[n].∑[end] <= params.min_weight - pred_leaf_cpu!(tree.pred, n, nodes[n].∑, params, ∇, nodes[n].is) + EvoTrees.pred_leaf_cpu!(tree.pred, n, nodes[n].∑, params, ∇, nodes[n].is) else best = findmax(findmax.(nodes[n].gains)) best_gain = best[1][1] @@ -126,8 +126,8 @@ function grow_tree!( nodes[n<<1].is, nodes[n<<1+1].is = _left, _right nodes[n<<1].∑ .= nodes[n].hL[best_feat][:, best_bin] nodes[n<<1+1].∑ .= nodes[n].hR[best_feat][:, best_bin] - nodes[n<<1].gain = get_gain(params, nodes[n<<1].∑) - nodes[n<<1+1].gain = get_gain(params, nodes[n<<1+1].∑) + nodes[n<<1].gain = EvoTrees.get_gain(params, nodes[n<<1].∑) + nodes[n<<1+1].gain = EvoTrees.get_gain(params, nodes[n<<1+1].∑) if length(_right) >= length(_left) push!(n_next, n << 1) @@ -137,7 +137,7 @@ function grow_tree!( push!(n_next, n << 1) end else - pred_leaf_cpu!(tree.pred, n, nodes[n].∑, params, ∇, nodes[n].is) + EvoTrees.pred_leaf_cpu!(tree.pred, n, nodes[n].∑, params, ∇, nodes[n].is) end end end @@ -151,9 +151,9 @@ end # grow a single oblivious tree - grow through all depth function grow_otree!( - tree::Tree{L,K}, + tree::EvoTrees.Tree{L,K}, nodes::Vector{N}, - params::EvoTypes{L}, + params::EvoTrees.EvoTypes{L}, ∇::CuMatrix, edges, js, @@ -183,7 +183,7 @@ function grow_otree!( # initialize summary stats nodes[1].∑ .= Vector(vec(sum(∇[:, nodes[1].is], dims=2))) - nodes[1].gain = get_gain(params, nodes[1].∑) # should use a GPU version? + nodes[1].gain = EvoTrees.get_gain(params, nodes[1].∑) # should use a GPU version? 
# grow while there are remaining active nodes while length(n_current) > 0 && depth <= params.max_depth @@ -197,7 +197,7 @@ function grow_otree!( if depth == params.max_depth || min_weight_flag for n in n_current # @info "length(nodes[n].is)" length(nodes[n].is) depth n - pred_leaf_cpu!(tree.pred, n, nodes[n].∑, params, ∇, nodes[n].is) + EvoTrees.pred_leaf_cpu!(tree.pred, n, nodes[n].∑, params, ∇, nodes[n].is) end else # update histograms @@ -217,8 +217,8 @@ function grow_otree!( update_hist_gpu!(nodes[n].h, h∇, ∇, x_bin, nodes[n].is, jsg, js) end end - @threads for n ∈ n_current - update_gains!(nodes[n], js, params, feattypes, monotone_constraints) + Threads.@threads for n ∈ n_current + EvoTrees.update_gains!(nodes[n], js, params, feattypes, monotone_constraints) end # initialize gains for node 1 in which all gains of a given depth will be accumulated @@ -273,8 +273,8 @@ function grow_otree!( nodes[n<<1].is, nodes[n<<1+1].is = _left, _right nodes[n<<1].∑ .= nodes[n].hL[best_feat][:, best_bin] nodes[n<<1+1].∑ .= nodes[n].hR[best_feat][:, best_bin] - nodes[n<<1].gain = get_gain(params, nodes[n<<1].∑) - nodes[n<<1+1].gain = get_gain(params, nodes[n<<1+1].∑) + nodes[n<<1].gain = EvoTrees.get_gain(params, nodes[n<<1].∑) + nodes[n<<1+1].gain = EvoTrees.get_gain(params, nodes[n<<1+1].∑) if length(_right) >= length(_left) push!(n_next, n << 1) @@ -286,7 +286,7 @@ function grow_otree!( end else for n in n_current - pred_leaf_cpu!(tree.pred, n, nodes[n].∑, params, ∇, nodes[n].is) + EvoTrees.pred_leaf_cpu!(tree.pred, n, nodes[n].∑, params, ∇, nodes[n].is) end end end @@ -295,4 +295,4 @@ function grow_otree!( end # end of loop over current nodes for a given depth return nothing -end \ No newline at end of file +end diff --git a/src/gpu/init.jl b/ext/EvoTreesCUDAExt/init.jl similarity index 69% rename from src/gpu/init.jl rename to ext/EvoTreesCUDAExt/init.jl index 3c3d682d..6a8dfcda 100644 --- a/src/gpu/init.jl +++ b/ext/EvoTreesCUDAExt/init.jl @@ -1,56 +1,56 @@ -function init_core(params::EvoTypes{L}, ::Type{GPU}, data, fnames, y_train, w, offset) where {L} +function EvoTrees.init_core(params::EvoTrees.EvoTypes{L}, ::Type{<:EvoTrees.GPU}, data, fnames, y_train, w, offset) where {L} # binarize data into quantiles - edges, featbins, feattypes = get_edges(data; fnames, nbins=params.nbins, rng=params.rng) - x_bin = CuArray(binarize(data; fnames, edges)) + edges, featbins, feattypes = EvoTrees.get_edges(data; fnames, nbins=params.nbins, rng=params.rng) + x_bin = CuArray(EvoTrees.binarize(data; fnames, edges)) nobs, nfeats = size(x_bin) T = Float32 target_levels = nothing - if L == Logistic + if L == EvoTrees.Logistic @assert eltype(y_train) <: Real && minimum(y_train) >= 0 && maximum(y_train) <= 1 K = 1 y = T.(y_train) - μ = [logit(mean(y))] - !isnothing(offset) && (offset .= logit.(offset)) - elseif L in [Poisson, Gamma, Tweedie] + μ = [EvoTrees.logit(EvoTrees.mean(y))] + !isnothing(offset) && (offset .= EvoTrees.logit.(offset)) + elseif L in [EvoTrees.Poisson, EvoTrees.Gamma, EvoTrees.Tweedie] @assert eltype(y_train) <: Real K = 1 y = T.(y_train) - μ = fill(log(mean(y)), 1) + μ = fill(log(EvoTrees.mean(y)), 1) !isnothing(offset) && (offset .= log.(offset)) - elseif L == MLogLoss - if eltype(y_train) <: CategoricalValue - target_levels = CategoricalArrays.levels(y_train) - y = UInt32.(CategoricalArrays.levelcode.(y_train)) + elseif L == EvoTrees.MLogLoss + if eltype(y_train) <: EvoTrees.CategoricalValue + target_levels = EvoTrees.CategoricalArrays.levels(y_train) + y = 
UInt32.(EvoTrees.CategoricalArrays.levelcode.(y_train)) elseif eltype(y_train) <: Integer || eltype(y_train) <: Bool || eltype(y_train) <: String || eltype(y_train) <: Char target_levels = sort(unique(y_train)) - yc = CategoricalVector(y_train, levels=target_levels) - y = UInt32.(CategoricalArrays.levelcode.(yc)) + yc = EvoTrees.CategoricalVector(y_train, levels=target_levels) + y = UInt32.(EvoTrees.CategoricalArrays.levelcode.(yc)) else @error "Invalid target eltype: $(eltype(y_train))" end K = length(target_levels) - μ = T.(log.(proportions(y, UInt32(1):UInt32(K)))) + μ = T.(log.(EvoTrees.proportions(y, UInt32(1):UInt32(K)))) μ .-= maximum(μ) !isnothing(offset) && (offset .= log.(offset)) - elseif L == GaussianMLE + elseif L == EvoTrees.GaussianMLE @assert eltype(y_train) <: Real K = 2 y = T.(y_train) - μ = [mean(y), log(std(y))] + μ = [EvoTrees.mean(y), log(EvoTrees.std(y))] !isnothing(offset) && (offset[:, 2] .= log.(offset[:, 2])) - elseif L == LogisticMLE + elseif L == EvoTrees.LogisticMLE @assert eltype(y_train) <: Real K = 2 y = T.(y_train) - μ = [mean(y), log(std(y) * sqrt(3) / π)] + μ = [EvoTrees.mean(y), log(EvoTrees.std(y) * sqrt(3) / π)] !isnothing(offset) && (offset[:, 2] .= log.(offset[:, 2])) else @assert eltype(y_train) <: Real K = 1 y = T.(y_train) - μ = [mean(y)] + μ = [EvoTrees.mean(y)] end y = CuArray(y) μ = T.(μ) @@ -94,8 +94,8 @@ function init_core(params::EvoTypes{L}, ::Type{GPU}, data, fnames, y_train, w, o ) # initialize model - nodes = [TrainNode(featbins, K, view(is_in, 1:0)) for n = 1:2^params.max_depth-1] - bias = [Tree{L,K}(μ)] + nodes = [EvoTrees.TrainNode(featbins, K, view(is_in, 1:0)) for n = 1:2^params.max_depth-1] + bias = [EvoTrees.Tree{L,K}(μ)] m = EvoTree{L,K}(bias, info) # build cache @@ -125,4 +125,4 @@ function init_core(params::EvoTypes{L}, ::Type{GPU}, data, fnames, y_train, w, o monotone_constraints=monotone_constraints, ) return m, cache -end \ No newline at end of file +end diff --git a/src/gpu/loss.jl b/ext/EvoTreesCUDAExt/loss.jl similarity index 89% rename from src/gpu/loss.jl rename to ext/EvoTreesCUDAExt/loss.jl index bad91534..0f6f4e4d 100644 --- a/src/gpu/loss.jl +++ b/ext/EvoTreesCUDAExt/loss.jl @@ -9,13 +9,13 @@ function kernel_mse_∇!(∇::CuDeviceMatrix, p::CuDeviceMatrix, y::CuDeviceVect end return end -function update_grads!( +function EvoTrees.update_grads!( ∇::CuMatrix, p::CuMatrix, y::CuVector, - ::EvoTreeRegressor{L}; + ::EvoTreeRegressor{<:EvoTrees.MSE}; MAX_THREADS=1024 -) where {L<:MSE} +) threads = min(MAX_THREADS, length(y)) blocks = cld(length(y), threads) @cuda blocks = blocks threads = threads kernel_mse_∇!(∇, p, y) @@ -29,19 +29,19 @@ end function kernel_logloss_∇!(∇::CuDeviceMatrix, p::CuDeviceMatrix, y::CuDeviceVector) i = threadIdx().x + (blockIdx().x - 1) * blockDim().x if i <= length(y) - @inbounds pred = sigmoid(p[1, i]) + @inbounds pred = EvoTrees.sigmoid(p[1, i]) @inbounds ∇[1, i] = (pred - y[i]) * ∇[3, i] @inbounds ∇[2, i] = pred * (1 - pred) * ∇[3, i] end return end -function update_grads!( +function EvoTrees.update_grads!( ∇::CuMatrix, p::CuMatrix, y::CuVector, - ::EvoTreeRegressor{L}; + ::EvoTreeRegressor{<:EvoTrees.LogLoss}; MAX_THREADS=1024 -) where {L<:LogLoss} +) threads = min(MAX_THREADS, length(y)) blocks = cld(length(y), threads) @cuda blocks = blocks threads = threads kernel_logloss_∇!(∇, p, y) @@ -61,13 +61,13 @@ function kernel_poisson_∇!(∇::CuDeviceMatrix, p::CuDeviceMatrix, y::CuDevice end return end -function update_grads!( +function EvoTrees.update_grads!( ∇::CuMatrix, p::CuMatrix, y::CuVector, 
- ::EvoTreeCount{L}; + ::EvoTreeCount{<:EvoTrees.Poisson}; MAX_THREADS=1024 -) where {L<:Poisson} +) threads = min(MAX_THREADS, length(y)) blocks = cld(length(y), threads) @cuda blocks = blocks threads = threads kernel_poisson_∇!(∇, p, y) @@ -87,13 +87,13 @@ function kernel_gamma_∇!(∇::CuDeviceMatrix, p::CuDeviceMatrix, y::CuDeviceVe end return end -function update_grads!( +function EvoTrees.update_grads!( ∇::CuMatrix, p::CuMatrix, y::CuVector, - ::EvoTreeRegressor{L}; + ::EvoTreeRegressor{<:EvoTrees.Gamma}; MAX_THREADS=1024 -) where {L<:Gamma} +) threads = min(MAX_THREADS, length(y)) blocks = cld(length(y), threads) @cuda blocks = blocks threads = threads kernel_gamma_∇!(∇, p, y) @@ -115,13 +115,13 @@ function kernel_tweedie_∇!(∇::CuDeviceMatrix, p::CuDeviceMatrix, y::CuDevice end return end -function update_grads!( +function EvoTrees.update_grads!( ∇::CuMatrix, p::CuMatrix, y::CuVector, - ::EvoTreeRegressor{L}; + ::EvoTreeRegressor{<:EvoTrees.Tweedie}; MAX_THREADS=1024 -) where {L<:Tweedie} +) threads = min(MAX_THREADS, length(y)) blocks = cld(length(y), threads) @cuda blocks = blocks threads = threads kernel_tweedie_∇!(∇, p, y) @@ -154,13 +154,13 @@ function kernel_mlogloss_∇!(∇::CuDeviceMatrix{T}, p::CuDeviceMatrix{T}, y::C return end -function update_grads!( +function EvoTrees.update_grads!( ∇::CuMatrix, p::CuMatrix, y::CuVector, - ::EvoTreeClassifier{L}; + ::EvoTreeClassifier{<:EvoTrees.MLogLoss}; MAX_THREADS=1024 -) where {L<:MLogLoss} +) threads = min(MAX_THREADS, length(y)) blocks = cld(length(y), threads) @cuda blocks = blocks threads = threads kernel_mlogloss_∇!(∇, p, y) @@ -187,16 +187,16 @@ function kernel_gauss_∇!(∇::CuDeviceMatrix, p::CuDeviceMatrix, y::CuDeviceVe return end -function update_grads!( +function EvoTrees.update_grads!( ∇::CuMatrix, p::CuMatrix, y::CuVector, - ::Union{EvoTreeGaussian{L},EvoTreeMLE{L}}; + ::Union{EvoTreeGaussian{<:EvoTrees.GaussianMLE},EvoTreeMLE{<:EvoTrees.GaussianMLE}}; MAX_THREADS=1024 -) where {L<:GaussianMLE} +) threads = min(MAX_THREADS, length(y)) blocks = cld(length(y), threads) @cuda blocks = blocks threads = threads kernel_gauss_∇!(∇, p, y) CUDA.synchronize() return -end \ No newline at end of file +end diff --git a/src/gpu/predict.jl b/ext/EvoTreesCUDAExt/predict.jl similarity index 84% rename from src/gpu/predict.jl rename to ext/EvoTreesCUDAExt/predict.jl index 9cad7c3b..6ae81645 100644 --- a/src/gpu/predict.jl +++ b/ext/EvoTreesCUDAExt/predict.jl @@ -1,6 +1,3 @@ -""" - predict_kernel! -""" function predict_kernel!( ::Type{L}, pred::CuDeviceMatrix{T}, @@ -28,12 +25,9 @@ function predict_kernel!( return nothing end -""" - predict_kernel! - GradientRegression -""" +# GradientRegression function predict_kernel!( - ::Type{L}, + ::Type{<:EvoTrees.GradientRegression}, pred::CuDeviceMatrix{T}, split, feats, @@ -41,7 +35,7 @@ function predict_kernel!( leaf_pred, x_bin, feattypes, -) where {L<:GradientRegression,T} +) where {T} i = threadIdx().x + (blockIdx().x - 1) * blockDim().x nid = 1 @inbounds if i <= size(pred, 2) @@ -56,12 +50,9 @@ function predict_kernel!( return nothing end -""" - predict_kernel! - Logistic -""" +# Logistic function predict_kernel!( - ::Type{L}, + ::Type{<:EvoTrees.LogLoss}, pred::CuDeviceMatrix{T}, split, feats, @@ -69,7 +60,7 @@ function predict_kernel!( leaf_pred, x_bin, feattypes, -) where {L<:LogLoss,T} +) where {T} i = threadIdx().x + (blockIdx().x - 1) * blockDim().x nid = 1 @inbounds if i <= size(pred, 2) @@ -84,12 +75,9 @@ function predict_kernel!( return nothing end -""" - predict_kernel! 
- MLE2P -""" +# MLE2P function predict_kernel!( - ::Type{L}, + ::Type{<:EvoTrees.MLE2P}, pred::CuDeviceMatrix{T}, split, feats, @@ -97,7 +85,7 @@ function predict_kernel!( leaf_pred, x_bin, feattypes, -) where {L<:MLE2P,T} +) where {T} i = threadIdx().x + (blockIdx().x - 1) * blockDim().x nid = 1 @inbounds if i <= size(pred, 2) @@ -114,9 +102,9 @@ function predict_kernel!( end # prediction from single tree - assign each observation to its final leaf -function predict!( +function EvoTrees.predict!( pred::CuMatrix{T}, - tree::Tree{L,K}, + tree::EvoTrees.Tree{L,K}, x_bin::CuMatrix, feattypes::CuVector{Bool}; MAX_THREADS=1024 @@ -137,13 +125,13 @@ function predict!( CUDA.synchronize() end -function predict!( +function EvoTrees.predict!( pred::CuMatrix{T}, - tree::Tree{L,K}, + tree::EvoTrees.Tree{L,K}, x_bin::CuMatrix, feattypes::CuVector{Bool}; MAX_THREADS=1024 -) where {L<:MLogLoss,K,T} +) where {L<:EvoTrees.MLogLoss,K,T} n = size(pred, 2) threads = min(MAX_THREADS, n) blocks = cld(n, threads) @@ -165,25 +153,25 @@ end function predict( m::EvoTree{L,K}, data, - ::Type{GPU}; + ::Type{<:EvoTrees.GPU}; ntree_limit=length(m.trees)) where {L,K} pred = CUDA.zeros(K, size(data, 1)) ntrees = length(m.trees) ntree_limit > ntrees && error("ntree_limit is larger than number of trees $ntrees.") - x_bin = CuArray(binarize(data; fnames=m.info[:fnames], edges=m.info[:edges])) + x_bin = CuArray(EvoTrees.binarize(data; fnames=m.info[:fnames], edges=m.info[:edges])) feattypes = CuArray(m.info[:feattypes]) for i = 1:ntree_limit - predict!(pred, m.trees[i], x_bin, feattypes) + EvoTrees.predict!(pred, m.trees[i], x_bin, feattypes) end - if L == LogLoss - pred .= sigmoid.(pred) - elseif L ∈ [Poisson, Gamma, Tweedie] + if L == EvoTrees.LogLoss + pred .= EvoTrees.sigmoid.(pred) + elseif L ∈ [EvoTrees.Poisson, EvoTrees.Gamma, EvoTrees.Tweedie] pred .= exp.(pred) - elseif L in [GaussianMLE, LogisticMLE] + elseif L in [EvoTrees.GaussianMLE, EvoTrees.LogisticMLE] pred[2, :] .= exp.(pred[2, :]) - elseif L == MLogLoss - softmax!(pred) + elseif L == EvoTrees.MLogLoss + EvoTrees.softmax!(pred) end pred = K == 1 ? 
vec(Array(pred')) : Array(pred') return pred @@ -205,7 +193,7 @@ function softmax_kernel!(p::CuDeviceMatrix{T}) where {T} return nothing end -function softmax!(p::CuMatrix{T}; MAX_THREADS=1024) where {T} +function EvoTrees.softmax!(p::CuMatrix{T}; MAX_THREADS=1024) where {T} K, nobs = size(p) threads = min(MAX_THREADS, nobs) blocks = cld(nobs, threads) diff --git a/src/gpu/subsample.jl b/ext/EvoTreesCUDAExt/subsample.jl similarity index 95% rename from src/gpu/subsample.jl rename to ext/EvoTreesCUDAExt/subsample.jl index 2120347e..4328485f 100644 --- a/src/gpu/subsample.jl +++ b/ext/EvoTreesCUDAExt/subsample.jl @@ -51,7 +51,7 @@ function subsample_step_2_kernel(is_in, is_out, counts, counts_cum, chunk_size) sync_threads() end -function subsample(is_in::CuVector, is_out::CuVector, mask::CuVector, rowsample::AbstractFloat, rng) +function EvoTrees.subsample(is_in::CuVector, is_out::CuVector, mask::CuVector, rowsample::AbstractFloat, rng) get_rand_gpu!(mask) cond = round(UInt8, 255 * rowsample) chunk_size = cld(length(is_in), min(cld(length(is_in), 128), 2048)) diff --git a/src/EvoTrees.jl b/src/EvoTrees.jl index 9fb52b1a..62fa751d 100644 --- a/src/EvoTrees.jl +++ b/src/EvoTrees.jl @@ -18,8 +18,6 @@ using Distributions using Tables using CategoricalArrays using Tables -using CUDA -using CUDA: @allowscalar, allowscalar using BSON using NetworkLayout @@ -41,13 +39,9 @@ include("subsample.jl") include("fit-utils.jl") include("fit.jl") -include("gpu/loss.jl") -include("gpu/eval.jl") -include("gpu/predict.jl") -include("gpu/init.jl") -include("gpu/subsample.jl") -include("gpu/fit-utils.jl") -include("gpu/fit.jl") +if !isdefined(Base, :get_extension) + include("../ext/EvoTreesCUDAExt/EvoTreesCUDAExt.jl") +end include("callback.jl") include("importance.jl") diff --git a/src/callback.jl b/src/callback.jl index 6d12b4fe..5d9d6220 100644 --- a/src/callback.jl +++ b/src/callback.jl @@ -44,7 +44,8 @@ function CallBack( else y = T.(y_eval) end - w = isnothing(w_name) ? ones(T, size(y)) : Vector{T}(Tables.getcolumn(deval, _w_name)) + V = device_array_type(device) + w = isnothing(w_name) ? device_ones(device, T, length(y)) : V{T}(Tables.getcolumn(deval, _w_name)) offset = !isnothing(offset_name) ? T.(Tables.getcolumn(deval, _offset_name)) : nothing if !isnothing(offset) @@ -56,11 +57,7 @@ function CallBack( p .+= offset' end - if device <: GPU - return CallBack(feval, CuArray(x_bin), CuArray(p), CuArray(y), CuArray(w), CuArray(similar(w)), CuArray(m.info[:feattypes])) - else - return CallBack(feval, x_bin, p, y, w, similar(w), m.info[:feattypes]) - end + return CallBack(feval, convert(V, x_bin), convert(V, p), convert(V, y), w, similar(w), convert(V, m.info[:feattypes])) end function CallBack( @@ -92,7 +89,8 @@ function CallBack( else y = T.(y_eval) end - w = isnothing(w_eval) ? ones(T, size(y)) : Vector{T}(w_eval) + V = device_array_type(device) + w = isnothing(w_eval) ? device_ones(device, T, length(y)) : V{T}(w_eval) offset = !isnothing(offset_eval) ? 
T.(offset_eval) : nothing if !isnothing(offset) @@ -104,11 +102,7 @@ function CallBack( p .+= offset' end - if device <: GPU - return CallBack(feval, CuArray(x_bin), CuArray(p), CuArray(y), CuArray(w), CuArray(similar(w)), CuArray(m.info[:feattypes])) - else - return CallBack(feval, x_bin, p, y, w, similar(w), m.info[:feattypes]) - end + return CallBack(feval, convert(V, x_bin), convert(V, p), convert(V, y), w, similar(w), convert(V, m.info[:feattypes])) end function (cb::CallBack)(logger, iter, tree) @@ -149,4 +143,4 @@ function update_logger!(logger, iter, metric) logger[:iter_since_best] += logger[:iter][end] - logger[:iter][end-1] end end -end \ No newline at end of file +end diff --git a/src/fit.jl b/src/fit.jl index 4628b4f7..feb07d7a 100644 --- a/src/fit.jl +++ b/src/fit.jl @@ -397,10 +397,7 @@ function fit_evotree( (logger[:iter_since_best] >= logger[:early_stopping_rounds]) && break end end - if String(device) == "gpu" - GC.gc(true) - CUDA.reclaim() - end + post_fit_gc(_device) if return_logger return (m, logger) @@ -410,6 +407,8 @@ function fit_evotree( end +# A no-op on the CPU, but on the GPU we perform garbage collection +post_fit_gc(::Type{<:CPU}) = nothing """ fit_evotree( @@ -517,10 +516,7 @@ function fit_evotree( (logger[:iter_since_best] >= logger[:early_stopping_rounds]) && break end end - if _device <: GPU - GC.gc(true) - CUDA.reclaim() - end + post_fit_gc(_device) if return_logger return (m, logger) diff --git a/src/init.jl b/src/init.jl index f9930f97..1c77f6ee 100644 --- a/src/init.jl +++ b/src/init.jl @@ -173,19 +173,18 @@ function init( T = Float32 nobs = length(Tables.getcolumn(dtrain, 1)) y_train = Tables.getcolumn(dtrain, _target_name) - if device <: GPU - w = isnothing(w_name) ? CUDA.ones(T, nobs) : CuArray{T}(Tables.getcolumn(dtrain, _w_name)) - offset = !isnothing(offset_name) ? CuArray{T}(Tables.getcolumn(dtrain, _offset_name)) : nothing - else - w = isnothing(w_name) ? ones(T, nobs) : Vector{T}(Tables.getcolumn(dtrain, _w_name)) - offset = !isnothing(offset_name) ? T.(Tables.getcolumn(dtrain, _offset_name)) : nothing - end + V = device_array_type(device) + w = isnothing(w_name) ? device_ones(device, T, nobs) : V{T}(Tables.getcolumn(dtrain, _w_name)) + offset = isnothing(offset_name) ? nothing : V{T}(Tables.getcolumn(dtrain, _offset_name)) m, cache = init_core(params, device, dtrain, fnames, y_train, w, offset) return m, cache end +# This should be different on CPUs and GPUs +device_ones(::Type{<:CPU}, ::Type{T}, n::Int) where {T} = ones(T, n) +device_array_type(::Type{<:CPU}) = Array """ init( @@ -216,13 +215,9 @@ function init( T = Float32 nobs = size(x_train, 1) - if device <: GPU - w = isnothing(w_train) ? CUDA.ones(T, nobs) : CuArray{T}(w_train) - offset = !isnothing(offset_train) ? CuArray{T}(offset_train) : nothing - else - w = isnothing(w_train) ? ones(T, nobs) : Vector{T}(w_train) - offset = !isnothing(offset_train) ? T.(offset_train) : nothing - end + V = device_array_type(device) + w = isnothing(w_train) ? device_ones(device, T, nobs) : V{T}(w_train) + offset = isnothing(offset_train) ? nothing : V{T}(offset_train) m, cache = init_core(params, device, x_train, fnames, y_train, w, offset)
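
A note on the `[weakdeps]` / `[extensions]` entries added to Project.toml above: on Julia ≥ 1.9, Pkg compiles and loads `EvoTreesCUDAExt` automatically once both EvoTrees and CUDA.jl are present in the active environment, so the GPU code paths cost nothing for CPU-only users. A minimal sketch of the resulting workflow, in the spirit of the docs addition; the toy data and the `EvoTreeRegressor` settings are placeholders, not part of this diff:

```julia
using EvoTrees
using CUDA  # loading CUDA.jl is what triggers EvoTreesCUDAExt on Julia ≥ 1.9

# stand-in data; any numeric matrix/vector pair works
x_train = randn(Float32, 1_000, 10)
y_train = randn(Float32, 1_000)

config = EvoTreeRegressor(nrounds=100)

# `device="gpu"` routes training and inference through the CUDA methods
m = fit_evotree(config; x_train, y_train, device="gpu")
p = m(x_train; device="gpu")
```

On Julia < 1.9 the extension mechanism does not exist, which is why `src/EvoTrees.jl` falls back to `include`-ing the extension sources directly behind the `!isdefined(Base, :get_extension)` guard.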
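The bulk of the remaining changes are mechanical renames of the form `mse` → `EvoTrees.mse` and `update_grads!` → `EvoTrees.update_grads!`. They matter because the GPU files no longer live inside the `EvoTrees` module: in an extension, a definition only attaches a new method to the existing EvoTrees function when it is qualified with the parent module, whereas an unqualified `function mse(...)` would silently create a fresh function local to `EvoTreesCUDAExt` that EvoTrees' own dispatch never reaches. A toy illustration of that rule, with module and function names invented for the example:

```julia
module Lib
loss(x::Vector) = sum(abs2, x)   # stand-in for a CPU method in the main package
evaluate(x) = loss(x)            # library code always dispatches on Lib.loss
end

module LibExt
using ..Lib
Lib.loss(x::Tuple) = sum(abs2, x)  # qualified: adds a method to Lib.loss
loss(x::Tuple) = zero(eltype(x))   # unqualified: a new LibExt.loss, never seen by Lib
end

Lib.evaluate([3.0, 4.0])   # 25.0, via the original Vector method
Lib.evaluate((3.0, 4.0))   # 25.0, via the extension's Tuple method
```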
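Similarly, `device_ones`, `device_array_type`, and `post_fit_gc` turn the former `if device <: GPU` branches in `init.jl`, `callback.jl`, and `fit.jl` into plain dispatch: EvoTrees ships the CPU methods shown above, and the extension overloads them for `GPU`. A self-contained sketch of the pattern; the device types here are stand-ins, while the hook names match the diff:

```julia
abstract type Device end
abstract type CPU <: Device end
abstract type GPU <: Device end

# CPU fallbacks, as defined in the main package
device_array_type(::Type{<:CPU}) = Array
device_ones(::Type{<:CPU}, ::Type{T}, n::Int) where {T} = ones(T, n)
post_fit_gc(::Type{<:CPU}) = nothing  # GC/reclaim only matters on GPU

# The CUDA extension would add the GPU methods, e.g.
# device_array_type(::Type{<:GPU}) = CuArray
# device_ones(::Type{<:GPU}, ::Type{T}, n::Int) where {T} = CUDA.ones(T, n)

# Call sites become device-agnostic, as in init.jl and callback.jl:
function make_weights(device, n)
    V = device_array_type(device)
    w = device_ones(device, Float32, n)
    return convert(V, w)
end

make_weights(CPU, 4)  # 4-element Vector{Float32} of ones
```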
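Finally, every loss and eval function in the diff launches its kernel with the same one-thread-per-observation recipe: cap the threads per block at `MAX_THREADS`, cover the remainder with `cld(n, threads)` blocks, and bound-check `i <= length(y)` inside the kernel because the last block is generally only partially filled. A minimal standalone kernel using the identical recipe; `scale!` is an invented example, not part of EvoTrees:

```julia
using CUDA

function scale_kernel!(x::CuDeviceVector{T}, a::T) where {T}
    i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    if i <= length(x)  # guard the tail threads of the last block
        @inbounds x[i] *= a
    end
    return nothing
end

function scale!(x::CuVector{T}, a::T; MAX_THREADS=1024) where {T}
    threads = min(MAX_THREADS, length(x))
    blocks = cld(length(x), threads)
    @cuda blocks = blocks threads = threads scale_kernel!(x, a)
    CUDA.synchronize()
    return x
end

scale!(CUDA.ones(Float32, 2_000), 2.0f0)  # 2 blocks × 1024 threads cover 2000 elements
```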